diff --git a/Exercise11_files/Problem1/Problem1.sh b/Exercise11_files/Problem1/Problem1.sh
new file mode 100644
index 0000000..130f9f6
--- /dev/null
+++ b/Exercise11_files/Problem1/Problem1.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+#
+# Search every proteome (*.fasta) in the current directory with a profile HMM
+# built from every reference family (*.ref), and summarise the hits in
+# results.txt, one line per hit: organism,<ref>.hmm,target,query,score
+#
+#   <ref>.ref   --muscle-->    <ref>.align       (multiple sequence alignment)
+#               --hmmbuild-->  <ref>.hmm         (profile HMM)
+#   <org>.fasta --hmmsearch--> <ref>_<org>.hits  (--tblout table)
+#
+# Requires muscle (invoked with -in/-out), hmmbuild and hmmsearch on PATH.
+# Run it from the directory that holds the .ref and .fasta files.
+
+set -euo pipefail
+shopt -s nullglob # an unmatched glob expands to nothing, not to itself
+
+refs=(*.ref)
+proteomes=(*.fasta)
+if (( ${#refs[@]} == 0 || ${#proteomes[@]} == 0 )); then
+  printf 'error: need at least one .ref and one .fasta file in %s\n' "$PWD" >&2
+  exit 1
+fi
+
+# Strip only the final extension so names containing other dots stay intact.
+# The name lists are overwritten rather than appended to, so running the
+# script twice does not duplicate entries.
+printf '%s\n' "${refs[@]%.ref}" > ref_filenames.txt
+printf '%s\n' "${proteomes[@]%.fasta}" > fasta_filenames.txt
+
+# Start from an empty results file: appending to the output of an earlier
+# run would list every hit twice.
+: > results.txt
+
+for ref in "${refs[@]%.ref}"; do
+  printf '%s\n' "$ref"
+  # Align the reference sequences, then build a profile HMM from the alignment.
+  muscle -in "${ref}.ref" -out "${ref}.align"
+  hmmbuild "${ref}.hmm" "${ref}.align"
+
+  for org in "${proteomes[@]%.fasta}"; do
+    hits="${ref}_${org}.hits"
+    hmmsearch --tblout "$hits" "${ref}.hmm" "${org}.fasta"
+    # Skip lines containing '#', keep columns 1, 3 and 6 of the table, and
+    # prefix each row with the organism and the HMM file name. Writing the
+    # row straight from awk avoids a temp file and shell word splitting.
+    awk -v org="$org" -v hmm="${ref}.hmm" \
+      '!/#/ { print org "," hmm "," $1 "," $3 "," $6 }' "$hits" >> results.txt
+  done
+done
diff --git a/Exercise11_files/Problem1/results.txt b/Exercise11_files/Problem1/results.txt
new file mode 100644
index 0000000..7dda95e
--- /dev/null
+++ b/Exercise11_files/Problem1/results.txt
@@ -0,0 +1,104 @@
+Arthrobacter,sigma.hmm,tr|A0A0P7HFJ4|A0A0P7HFJ4_9MICC,sigma,78.3
+Arthrobacter,sigma.hmm,tr|A0A0P7FPV6|A0A0P7FPV6_9MICC,sigma,72.0
+Arthrobacter,sigma.hmm,tr|A0A0P7GDV0|A0A0P7GDV0_9MICC,sigma,55.2
+Arthrobacter,sigma.hmm,tr|A0A0P7FY19|A0A0P7FY19_9MICC,sigma,53.3
+Arthrobacter,sigma.hmm,tr|A0A0P7HEI0|A0A0P7HEI0_9MICC,sigma,50.0
+Arthrobacter,sigma.hmm,tr|A0A0P7GE65|A0A0P7GE65_9MICC,sigma,46.4
+Arthrobacter,sigma.hmm,tr|A0A0P7HL11|A0A0P7HL11_9MICC,sigma,42.5
+Arthrobacter,sigma.hmm,tr|A0A0P7FHT1|A0A0P7FHT1_9MICC,sigma,40.8
+Arthrobacter,sigma.hmm,tr|A0A0N8HX11|A0A0N8HX11_9MICC,sigma,40.8
+Arthrobacter,sigma.hmm,tr|A0A0P7GAX7|A0A0P7GAX7_9MICC,sigma,22.9
+Bacillus,sigma.hmm,tr|A6CNQ6|A6CNQ6_9BACI,sigma,83.6
+Bacillus,sigma.hmm,tr|A6CSJ0|A6CSJ0_9BACI,sigma,78.1
+Bacillus,sigma.hmm,tr|A6CJY3|A6CJY3_9BACI,sigma,75.3
+Bacillus,sigma.hmm,tr|A6CU12|A6CU12_9BACI,sigma,69.6
+Bacillus,sigma.hmm,tr|A6CJR5|A6CJR5_9BACI,sigma,65.9
+Bacillus,sigma.hmm,tr|A6CHU4|A6CHU4_9BACI,sigma,64.1
+Bacillus,sigma.hmm,tr|A6CMX3|A6CMX3_9BACI,sigma,63.2
+Bacillus,sigma.hmm,tr|A6CS17|A6CS17_9BACI,sigma,62.8
+Bacillus,sigma.hmm,tr|A6CU10|A6CU10_9BACI,sigma,62.3
+Bacillus,sigma.hmm,tr|A6CJX5|A6CJX5_9BACI,sigma,59.8
+Bacillus,sigma.hmm,tr|A6CSN9|A6CSN9_9BACI,sigma,58.3
+Bacillus,sigma.hmm,tr|A6CTI9|A6CTI9_9BACI,sigma,57.7
+Bacillus,sigma.hmm,tr|A6CPD2|A6CPD2_9BACI,sigma,57.7 +Bacillus,sigma.hmm,tr|A6CIS3|A6CIS3_9BACI,sigma,57.3 +Bacillus,sigma.hmm,tr|A6CII5|A6CII5_9BACI,sigma,56.5 +Bacillus,sigma.hmm,tr|A6CI15|A6CI15_9BACI,sigma,56.3 +Bacillus,sigma.hmm,tr|A6CU83|A6CU83_9BACI,sigma,55.0 +Bacillus,sigma.hmm,tr|A6CI19|A6CI19_9BACI,sigma,51.8 +Bacillus,sigma.hmm,tr|A6CQB8|A6CQB8_9BACI,sigma,50.3 +Bacillus,sigma.hmm,tr|A6CK70|A6CK70_9BACI,sigma,47.1 +Bacillus,sigma.hmm,tr|A6CIW8|A6CIW8_9BACI,sigma,42.5 +Bacillus,sigma.hmm,tr|A6CIF2|A6CIF2_9BACI,sigma,41.1 +Bacillus,sigma.hmm,tr|A6CM50|A6CM50_9BACI,sigma,35.4 +Bacillus,sigma.hmm,tr|A6CL73|A6CL73_9BACI,sigma,33.6 +Clostridium,sigma.hmm,tr|R7MFB1|R7MFB1_9CLOT,sigma,79.7 +Clostridium,sigma.hmm,tr|R7MJY2|R7MJY2_9CLOT,sigma,79.0 +Clostridium,sigma.hmm,tr|R7MGY2|R7MGY2_9CLOT,sigma,70.7 +Clostridium,sigma.hmm,tr|R7MFJ3|R7MFJ3_9CLOT,sigma,48.3 +Clostridium,sigma.hmm,tr|R7MFY1|R7MFY1_9CLOT,sigma,46.9 +Clostridium,sigma.hmm,tr|R7MAD7|R7MAD7_9CLOT,sigma,44.6 +Clostridium,sigma.hmm,tr|R7MHN3|R7MHN3_9CLOT,sigma,40.4 +Clostridium,sigma.hmm,tr|R7MCS9|R7MCS9_9CLOT,sigma,39.9 +Flavobacterium,sigma.hmm,tr|A0A1S1J8W1|A0A1S1J8W1_9FLAO,sigma,61.9 +Flavobacterium,sigma.hmm,tr|A0A1S1J0A5|A0A1S1J0A5_9FLAO,sigma,61.8 +Flavobacterium,sigma.hmm,tr|A0A1S1J9H9|A0A1S1J9H9_9FLAO,sigma,61.4 +Flavobacterium,sigma.hmm,tr|A0A1S1JAH2|A0A1S1JAH2_9FLAO,sigma,61.2 +Flavobacterium,sigma.hmm,tr|A0A1S1J7G0|A0A1S1J7G0_9FLAO,sigma,57.5 +Flavobacterium,sigma.hmm,tr|A0A1S1J0Z8|A0A1S1J0Z8_9FLAO,sigma,56.6 +Flavobacterium,sigma.hmm,tr|A0A1S1JF58|A0A1S1JF58_9FLAO,sigma,54.3 +Flavobacterium,sigma.hmm,tr|A0A1S1JAU9|A0A1S1JAU9_9FLAO,sigma,52.5 +Flavobacterium,sigma.hmm,tr|A0A1S1JAK2|A0A1S1JAK2_9FLAO,sigma,48.2 +Flavobacterium,sigma.hmm,tr|A0A1S1J180|A0A1S1J180_9FLAO,sigma,47.8 +Flavobacterium,sigma.hmm,tr|A0A1S1J4G5|A0A1S1J4G5_9FLAO,sigma,46.1 +Flavobacterium,sigma.hmm,tr|A0A1S1J5F7|A0A1S1J5F7_9FLAO,sigma,45.8 +Flavobacterium,sigma.hmm,tr|A0A1S1J7E0|A0A1S1J7E0_9FLAO,sigma,45.1 
+Flavobacterium,sigma.hmm,tr|A0A1S1J430|A0A1S1J430_9FLAO,sigma,42.6 +Flavobacterium,sigma.hmm,tr|A0A1S1J2T6|A0A1S1J2T6_9FLAO,sigma,42.3 +Flavobacterium,sigma.hmm,tr|A0A1S1JDD3|A0A1S1JDD3_9FLAO,sigma,38.2 +Flavobacterium,sigma.hmm,tr|A0A1S1J6K9|A0A1S1J6K9_9FLAO,sigma,37.7 +Flavobacterium,sigma.hmm,tr|A0A1S1J9G5|A0A1S1J9G5_9FLAO,sigma,36.5 +Flavobacterium,sigma.hmm,tr|A0A1S1J5L5|A0A1S1J5L5_9FLAO,sigma,35.8 +Flavobacterium,sigma.hmm,tr|A0A1S1J6X1|A0A1S1J6X1_9FLAO,sigma,31.1 +Flavobacterium,sigma.hmm,tr|A0A1S1JCL3|A0A1S1JCL3_9FLAO,sigma,30.5 +Flavobacterium,sigma.hmm,tr|A0A1S1J6M2|A0A1S1J6M2_9FLAO,sigma,22.2 +Flavobacterium,sigma.hmm,tr|A0A1S1J022|A0A1S1J022_9FLAO,sigma,21.0 +Flavobacterium,sigma.hmm,tr|A0A1S1J312|A0A1S1J312_9FLAO,sigma,18.1 +Limnohabitans,sigma.hmm,tr|A0A0P0LD52|A0A0P0LD52_9BURK,sigma,74.9 +Limnohabitans,sigma.hmm,tr|A0A0P0LKP5|A0A0P0LKP5_9BURK,sigma,60.9 +Limnohabitans,sigma.hmm,tr|A0A0P0M8A4|A0A0P0M8A4_9BURK,sigma,58.7 +Limnohabitans,sigma.hmm,tr|A0A0P0MBT4|A0A0P0MBT4_9BURK,sigma,49.5 +Limnohabitans,sigma.hmm,tr|A0A0P0M9M9|A0A0P0M9M9_9BURK,sigma,47.6 +Limnohabitans,sigma.hmm,tr|A0A0P0MA37|A0A0P0MA37_9BURK,sigma,46.7 +Limnohabitans,sigma.hmm,tr|A0A0P0M9Z4|A0A0P0M9Z4_9BURK,sigma,21.2 +Rhizobium,sigma.hmm,tr|A0A1Q9ANZ6|A0A1Q9ANZ6_9RHIZ,sigma,75.3 +Rhizobium,sigma.hmm,tr|A0A1Q9ANQ8|A0A1Q9ANQ8_9RHIZ,sigma,75.1 +Rhizobium,sigma.hmm,tr|A0A1Q9AL94|A0A1Q9AL94_9RHIZ,sigma,65.5 +Rhizobium,sigma.hmm,tr|A0A1Q9ALF0|A0A1Q9ALF0_9RHIZ,sigma,61.3 +Rhizobium,sigma.hmm,tr|A0A1Q9AF84|A0A1Q9AF84_9RHIZ,sigma,61.2 +Rhizobium,sigma.hmm,tr|A0A1Q9AII6|A0A1Q9AII6_9RHIZ,sigma,59.6 +Rhizobium,sigma.hmm,tr|A0A1Q9ACE9|A0A1Q9ACE9_9RHIZ,sigma,56.9 +Rhizobium,sigma.hmm,tr|A0A1Q9AIU3|A0A1Q9AIU3_9RHIZ,sigma,55.8 +Rhizobium,sigma.hmm,tr|A0A1Q9AEH2|A0A1Q9AEH2_9RHIZ,sigma,54.1 +Rhizobium,sigma.hmm,tr|A0A1Q9ANU6|A0A1Q9ANU6_9RHIZ,sigma,44.9 +Rhizobium,sigma.hmm,tr|A0A1Q9AQ99|A0A1Q9AQ99_9RHIZ,sigma,43.5 +Rhizobium,sigma.hmm,tr|A0A1Q9AQ73|A0A1Q9AQ73_9RHIZ,sigma,43.4 
+Rhizobium,sigma.hmm,tr|A0A1Q9ADJ9|A0A1Q9ADJ9_9RHIZ,sigma,32.2
+Rhizobium,sigma.hmm,tr|A0A1Q9APD8|A0A1Q9APD8_9RHIZ,sigma,24.5
+Roseobacter,sigma.hmm,tr|B7RH33|B7RH33_9RHOB,sigma,75.3
+Roseobacter,sigma.hmm,tr|B7RGX5|B7RGX5_9RHOB,sigma,64.7
+Roseobacter,sigma.hmm,tr|B7RH17|B7RH17_9RHOB,sigma,63.8
+Roseobacter,sigma.hmm,tr|B7RI57|B7RI57_9RHOB,sigma,62.7
+Roseobacter,sigma.hmm,tr|B7RJL9|B7RJL9_9RHOB,sigma,62.7
+Roseobacter,sigma.hmm,tr|B7RH51|B7RH51_9RHOB,sigma,56.6
+Roseobacter,sigma.hmm,tr|B7RSA6|B7RSA6_9RHOB,sigma,49.2
+Verrucomicrobia,sigma.hmm,tr|A0A1W9LF71|A0A1W9LF71_9BACT,sigma,75.1
+Verrucomicrobia,sigma.hmm,tr|A0A1W9LBX6|A0A1W9LBX6_9BACT,sigma,68.4
+Verrucomicrobia,sigma.hmm,tr|A0A1W9LH48|A0A1W9LH48_9BACT,sigma,61.3
+Verrucomicrobia,sigma.hmm,tr|A0A1W9LHW5|A0A1W9LHW5_9BACT,sigma,49.8
+Verrucomicrobia,sigma.hmm,tr|A0A1W9LA94|A0A1W9LA94_9BACT,sigma,48.6
+Bacillus,sporecoat.hmm,tr|A6BZD2|A6CT85_9BACI,sporecoat,118.6
+Bacillus,transporter.hmm,tr|A6CMF6|A6CMF6_9BACI,transporter,24.9
+Flavobacterium,transporter.hmm,tr|A0A1S1J3R7|A0A1S1J3R7_9FLAO,transporter,33.9
+Limnohabitans,transporter.hmm,tr|A0A0P0M9B4|A0A0P0M9B4_9BURK,transporter,34.2
+Roseobacter,transporter.hmm,tr|B7RKP8|B7RKP8_9RHOB,transporter,10.3
diff --git a/Exercise11_files/Problem2/problem2.py b/Exercise11_files/Problem2/problem2.py
new file mode 100644
index 0000000..660e286
--- /dev/null
+++ b/Exercise11_files/Problem2/problem2.py
@@ -0,0 +1,51 @@
+"""Sort the records of motifsort.fasta into three FASTA files by motif.
+
+A record goes to motif1.fasta if its sequence contains MOTIF1, otherwise to
+motif2.fasta if it contains MOTIF2, otherwise to nomotif.fasta.
+"""
+
+import re
+
+MOTIF1 = re.compile(r'AKKPRVZE')
+MOTIF2 = re.compile(r'AAQWWRNYGG')
+
+
+def write_record(seqid, chunks, motif1out, motif2out, nomotifout):
+    """Write one record to the file for the first motif its sequence contains.
+
+    chunks holds the record's sequence lines; they are joined before the
+    search so a motif that straddles a line break is still found.
+    """
+    sequence = "".join(chunks)
+    if MOTIF1.search(sequence):
+        out = motif1out
+    elif MOTIF2.search(sequence):
+        out = motif2out
+    else:
+        out = nomotifout
+    out.write(seqid + "\n" + sequence + "\n")
+
+
+# The with-block closes all four files even if an error interrupts the loop.
+with open("motifsort.fasta", "r") as motifsortin, \
+        open("motif1.fasta", "w") as motif1out, \
+        open("motif2.fasta", "w") as motif2out, \
+        open("nomotif.fasta", "w") as nomotifout:
+    seqid = None
+    chunks = []
+    for line in motifsortin:
+        line = line.strip()
+        if not line:
+            continue  # a blank line is not a sequence
+        if line.startswith(">"):
+            # A new header ends the previous record, if it had any sequence.
+            if chunks:
+                write_record(seqid, chunks, motif1out, motif2out, nomotifout)
+            seqid = line
+            chunks = []
+        elif seqid is None:
+            raise ValueError("motifsort.fasta: sequence before any '>' header")
+        else:
+            chunks.append(line)
+    if chunks:
+        write_record(seqid, chunks, motif1out, motif2out, nomotifout)