diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index bc93ed10..00000000 Binary files a/.DS_Store and /dev/null differ diff --git a/README.md b/README.md index 3440b850..1ea0ec90 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Reinforcement Learning Complex Detection -This reinforcement learning algorithm is a machine learning method for complex detection in networks. Using known communities, it is trained and learns to find new complexes in the network. +This is a reinforcement learning algorithm for community detection in networks. Trained on known communities, it learns to find new communities in a network. # Installation: Required python3 @@ -10,38 +10,45 @@ Requirements installation: 1. For a toy network use input_toy.yaml 2. For hu.MAP - use input file input_humap.yaml - # Instructions: To run this pipeline on a new network, construct an input file similar to input_toy.yaml specifying where to find the required inputs. -1. Specify input options relating to network: Set options dir_nm (directory containing the network) and netf_nm (file name of the network) -2. Specify input options relating to known communities in network: If you already have sepearated known communities into train and test communitites, specify their paths in the options comf_nm and comf_test_nm (relative to the directory specified in the option:dir_nm) Otherwise, Split complex list into train and test: Set option split_flag = 1 Verify that train test size distributions in figure are the similar. Also check that number of training complexes is not too low by looking at the res_metrics.out file. Set options comf_nm and comf_test_nm with these two files. All the above paths are set relative to the directory specified in the option:dir_nm Make sure to change the option split_flag back to 0 after this step +1. Specify the network input file: Set options dir_nm (directory containing the network) and netf_nm (file name of the network) +2. 
Specify the paths for train and test communities, in the options comf_nm and comf_test_nm (relative to the directory specified in the option dir_nm) -An example bash script to run the RL pipeline after the above steps is shown below: This is for hu.MAP complexes +An example bash script to run the RL pipeline after the above steps is shown below: This is for complexes learned on the human PPI network, hu.MAP 1.0: ``` #!/bin/bash -mtype = humap -input_file_name = input_$mtype.yaml -graph_file = hu.MAP_network_experiments/input_data/humap_network_weighted_edge_lists.txt -input_training_file = hu.MAP_network_experiments/intermediate_output_results_data/training_CORUM_complexes_node_lists.txt -input_testing_file = hu.MAP_network_experiments/intermediate_output_results_data/testing_CORUM_complexes_node_lists.txt -out_dir_name = /results_$mtype -train_results = $out_dir_name/train_results -pred_results = $out_dir_name/pred_results -id_map_path = convert_ids/humap_gene_id_name_map.txt + +mtype=humap +input_file_name=input_$mtype.yaml +graph_file=hu.MAP_network/input_data/humap_network_weighted_edge_lists.txt +input_training_file=hu.MAP_network/intermediate_data/training_CORUM_complexes_node_lists.txt +input_testing_file=hu.MAP_network/intermediate_data/testing_CORUM_complexes_node_lists.txt +mkdir results_$mtype +out_dir_name=./results_$mtype +train_results=$out_dir_name/train_results +pred_results=$out_dir_name/pred_results +id_map_path=convert_ids/humap_gene_id_name_map.txt + echo Training Algorithm.... -python3 functions/main_training.py --input_training_file $input_training_file --graph_file $graph_file --train_results $train_results +python3 main_training.py --input_training_file $input_training_file --graph_file $graph_file --train_results $train_results echo Predicting new complexes from known communities... 
-python3 functions/main_prediction.py --graph_file $graph_file --train_results $train_results --out_dir_name $out_dir_name --pred_results $pred_results +python3 main_prediction.py --graph_file $graph_file --train_results $train_results --out_dir_name $out_dir_name --pred_results $pred_results echo Merging similar communities... -python3 functions/postprocessing.py --input_file_name $input_file_name --graph_file $graph_file --out_dir_name $out_dir_name --pred_results $pred_results --train_results $train_results --input_training_file $input_training_file --input_testing_file $input_testing_file --id_map_path $id_map_path +python3 postprocessing.py --input_file_name $input_file_name --graph_file $graph_file --out_dir_name $out_dir_name --pred_results $pred_results --train_results $train_results --input_training_file $input_training_file --input_testing_file $input_testing_file --id_map_path $id_map_path echo Comparing predicted and known communitites... -python3 functions/eval_complex_RL --input_file_name $input_file_name --input_training_file $input_training_file --input_testing_file $input_testing_file --out_dir_name $out_dir_name +python3 eval_complex_RL.py --input_file_name $input_file_name --input_training_file $input_training_file --input_testing_file $input_testing_file --out_dir_name $out_dir_name --id_name_path $id_map_path ``` -# Additional tips: +## Additional tips: For each of the scripts, optional arguments can be viewed by running: python3 script_name.py --help For each command, add the desired argument directly on the terminal. + +# References: +M. V. Palukuri, R. S. Patil, and E. M. Marcotte, “Molecular complex detection in protein interaction networks through reinforcement learning.” bioRxiv, p. 2022.06.20.496772. doi: [10.1101/2022.06.20.496772](https://www.biorxiv.org/content/10.1101/2022.06.20.496772v1). 
+ +Interactive visualizations of complexes learned by the RL algorithm on two human PPI networks, hu.MAP 1.0 and hu.MAP 2.0 are available here: [https://marcottelab.github.io/RL_humap_prediction/](https://marcottelab.github.io/RL_humap_prediction/) diff --git a/convert_humap_ids2names.py b/convert_humap_ids2names.py index d68a3e18..7b398331 100644 --- a/convert_humap_ids2names.py +++ b/convert_humap_ids2names.py @@ -184,6 +184,6 @@ def convert2names_wscores(complexes, filename, G, filename_edges, ids_map): convert_edges_wscore(lines, G, filename_edges, id_name_map) -def convert2names_wscores_matches(complex_matches, filename): - id_name_map = read_gene_id_map() +def convert2names_wscores_matches(complex_matches, filename, id_name_map_path): + id_name_map = read_gene_id_map(id_name_map_path) convert_nodes_matches_wscore(complex_matches, filename, id_name_map) diff --git a/eval_cmplx_sc.py b/eval_cmplx_sc.py index e2287b7d..0a2c5ef7 100644 --- a/eval_cmplx_sc.py +++ b/eval_cmplx_sc.py @@ -16,11 +16,11 @@ -def write_best_matches(best_matches_for_known,out_comp_nm,dir_nm,suffix): +def write_best_matches(best_matches_for_known,out_comp_nm,dir_nm,suffix,id_name_map): sorted_matches = sorted(best_matches_for_known,key=lambda x: x[2],reverse=True) if dir_nm == "humap": - convert2names_wscores_matches(sorted_matches, out_comp_nm + suffix + '_known_pred_matches_names.out') + convert2names_wscores_matches(sorted_matches, out_comp_nm + suffix + '_known_pred_matches_names.out',id_name_map) with open(out_comp_nm + suffix + '_known_pred_matches.out', "w") as fn: fn_write = fn.write @@ -127,7 +127,7 @@ def f1_similarity(P,T): return F1_score, C -def one2one_matches(known_complex_nodes_list, fin_list_graphs, N_pred_comp, N_test_comp,out_comp_nm,suffix,dir_nm): +def one2one_matches(known_complex_nodes_list, fin_list_graphs, N_pred_comp, N_test_comp,out_comp_nm,suffix,dir_nm, id_name_map): Metric = np_zeros((N_test_comp, N_pred_comp)) Common_nodes = np_zeros((N_test_comp, 
N_pred_comp)) @@ -174,8 +174,8 @@ def one2one_matches(known_complex_nodes_list, fin_list_graphs, N_pred_comp, N_te avg_f1_score = (avged_f1_score4known + avged_f1_score4pred)/2 net_f1_score = 2 * avged_f1_score4known * avged_f1_score4pred / (avged_f1_score4known + avged_f1_score4pred) - write_best_matches(best_matches_4known,out_comp_nm,dir_nm,'_best4known' + suffix) - write_best_matches(best_matches_4predicted,out_comp_nm,dir_nm,'_best4predicted' + suffix) + write_best_matches(best_matches_4known,out_comp_nm,dir_nm,'_best4known' + suffix, id_name_map) + write_best_matches(best_matches_4predicted,out_comp_nm,dir_nm,'_best4predicted' + suffix, id_name_map) prec_MMR, recall_MMR, f1_MMR, max_matching_edges = f1_mmr(Metric) @@ -295,28 +295,28 @@ def remove_unknown_prots(fin_list_graphs_orig, prot_list): return fin_list_graphs -def compute_metrics(known_complex_nodes_list, fin_list_graphs,out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix): +def compute_metrics(known_complex_nodes_list, fin_list_graphs,out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix, id_name_map): if N_test_comp != 0 and N_pred_comp != 0: Precision, Recall, F1_score = node_comparison_prec_recall(known_complex_nodes_list,fin_list_graphs, N_pred_comp, N_test_comp, inputs["eval_p"],out_comp_nm+suffix) - avg_f1_score, net_f1_score,PPV,Sn,acc_unbiased,prec_MMR, recall_MMR, f1_MMR,n_matches = one2one_matches(known_complex_nodes_list, fin_list_graphs, N_pred_comp, N_test_comp,out_comp_nm,suffix,inputs['dir_nm']) + avg_f1_score, net_f1_score,PPV,Sn,acc_unbiased,prec_MMR, recall_MMR, f1_MMR,n_matches = one2one_matches(known_complex_nodes_list, fin_list_graphs, N_pred_comp, N_test_comp,out_comp_nm,suffix,inputs['dir_nm'], id_name_map) with open(out_comp_nm + '_metrics.out', "a") as fid: print("No. 
of matches in MMR = ", n_matches, file=fid) - print("MMR Precision = %.3f" % prec_MMR, file=fid) - print("MMR Recall = %.3f" % recall_MMR, file=fid) - print("MMR F1 score = %.3f" % f1_MMR, file=fid) - print("Net F1 score = %.3f" % net_f1_score, file=fid) + print("FMM Precision = %.3f" % prec_MMR, file=fid) + print("FMM Recall = %.3f" % recall_MMR, file=fid) + print("FMM F1 score = %.3f" % f1_MMR, file=fid) + print("CMMF = %.3f" % net_f1_score, file=fid) print("Unbiased PPV = %.3f" % PPV, file=fid) print("Unbiased Sn = %.3f" % Sn, file=fid) - print("Unbiased accuracy= %.3f" % acc_unbiased, file=fid) + print("Unbiased accuracy (UnSPA)= %.3f" % acc_unbiased, file=fid) print("Net Averaged F1 score (Average of Precision and Recall based on F1 score) = %.3f" % avg_f1_score, file=fid) - print("Prediction Precision = %.3f" % Precision, file=fid) - print("Prediction Recall = %.3f" % Recall, file=fid) - print("Prediction F1 score = %.3f" % F1_score, file=fid) + print("Qi et al Precision = %.3f" % Precision, file=fid) + print("Qi et al Recall = %.3f" % Recall, file=fid) + print("Qi et al F1 score = %.3f" % F1_score, file=fid) -def eval_complex(rf=0, rf_nm=0, inputs={}, known_complex_nodes_list=[], prot_list=[], fin_list_graphs=[], out_comp_nm = '',suffix="both"): +def eval_complex(rf=0, rf_nm=0, inputs={}, known_complex_nodes_list=[], prot_list=[], fin_list_graphs=[], out_comp_nm = '',suffix="both", id_name_map = ""): # rf - read flag to read complexes from file logging_info("Evaluating complexes..." + suffix) if rf == 1: @@ -338,7 +338,7 @@ def eval_complex(rf=0, rf_nm=0, inputs={}, known_complex_nodes_list=[], prot_lis print("No. 
of Predicted complexes = ", N_pred_comp, file=fid) print("\n -- Metrics on complexes with all proteins -- ", file=fid) print(out_comp_nm) - compute_metrics(known_complex_nodes_list, fin_list_graphs, out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix+'_all_prots') + compute_metrics(known_complex_nodes_list, fin_list_graphs, out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix+'_all_prots',id_name_map) fin_list_graphs = remove_unknown_prots(fin_list_graphs, prot_list) plot_size_dists(known_complex_nodes_list, fin_list_graphs, sizes_orig, out_comp_nm) @@ -348,8 +348,8 @@ def eval_complex(rf=0, rf_nm=0, inputs={}, known_complex_nodes_list=[], prot_lis print("No. of Predicted complexes after removing non-gold std proteins = ", N_pred_comp, file=fid) print("\n -- Metrics on complexes with only gold std proteins -- ", file=fid) - compute_metrics(known_complex_nodes_list, fin_list_graphs, out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix+'_gold_std_prots') + compute_metrics(known_complex_nodes_list, fin_list_graphs, out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix+'_gold_std_prots', id_name_map) with open(out_comp_nm + '_metrics.out', "a") as fid: print("-- Finished writing main metrics -- \n", file=fid) - logging_info("Finished evaluating basic metrics for complexes " + suffix) \ No newline at end of file + logging_info("Finished evaluating basic metrics for complexes " + suffix) diff --git a/eval_complex_RL.py b/eval_complex_RL.py index 19f15ace..5e09a0ad 100644 --- a/eval_complex_RL.py +++ b/eval_complex_RL.py @@ -1,8 +1,7 @@ from argparse import ArgumentParser as argparse_ArgumentParser, ArgumentParser from pickle import load as pickle_load from yaml import load as yaml_load, dump as yaml_dump, Loader as yaml_Loader -from eval_cmplx_sc import eval_complex -from eval_cmplx_sc import remove_unknown_prots +from eval_cmplx_sc import eval_complex, remove_unknown_prots from main6_eval import run_metrics import os def main(): @@ -13,6 +12,7 @@ def main(): 
parser.add_argument("--input_testing_file", default="", help="Testing Graph file path") parser.add_argument("--out_dir_name", default="", help="Output directory name") parser.add_argument("--evaluate_additional_metrics", default=1, help="complexes file name") + parser.add_argument("--id_name_path", default="", help="Path for id to gene name file") args = parser.parse_args() print(args.input_file_name) with open(args.input_file_name, 'r') as f: @@ -52,7 +52,7 @@ def main(): # Remove all proteins in Predicted complexes that are not present in known complex protein list fin_list_graphs = remove_unknown_prots(fin_list_graphs_orig, prot_list) suffix = '' - eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm, suffix="_train") + eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm, suffix="_train", id_name_map = args.id_name_path) if args.evaluate_additional_metrics: try: run_metrics(known_complex_nodes_list, fin_list_graphs, out_comp_nm, "_train") @@ -75,7 +75,7 @@ def main(): # Remove all proteins in Predicted complexes that are not present in known complex protein list fin_list_graphs = remove_unknown_prots(fin_list_graphs_orig, prot_list) suffix = '' - eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm,suffix="_train") + eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm,suffix="_train", id_name_map = args.id_name_path) if args.evaluate_additional_metrics: try: @@ -109,7 +109,7 @@ def main(): N_pred_comp = len(fin_list_graphs) suffix = '' - eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm,suffix="_train") + eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm,suffix="_train", id_name_map = args.id_name_path) if args.evaluate_additional_metrics: try: diff --git a/hu.MAP_network/.DS_Store b/hu.MAP_network/.DS_Store deleted 
file mode 100644 index 15d62dd4..00000000 Binary files a/hu.MAP_network/.DS_Store and /dev/null differ diff --git a/humap_steps.sh b/humap_steps.sh new file mode 100755 index 00000000..bccb6f6f --- /dev/null +++ b/humap_steps.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +mtype=humap +input_file_name=input_$mtype.yaml +graph_file=hu.MAP_network/input_data/humap_network_weighted_edge_lists.txt +input_training_file=hu.MAP_network/intermediate_data/training_CORUM_complexes_node_lists.txt +input_testing_file=hu.MAP_network/intermediate_data/testing_CORUM_complexes_node_lists.txt +mkdir results_$mtype +out_dir_name=./results_$mtype +train_results=$out_dir_name/train_results +pred_results=$out_dir_name/pred_results +id_map_path=convert_ids/humap_gene_id_name_map.txt + +echo Training Algorithm.... +python3 main_training.py --input_training_file $input_training_file --graph_file $graph_file --train_results $train_results + +echo Predicting new complexes from known communities... +python3 main_prediction.py --graph_file $graph_file --train_results $train_results --out_dir_name $out_dir_name --pred_results $pred_results + +echo Merging similar communities... +python3 postprocessing.py --input_file_name $input_file_name --graph_file $graph_file --out_dir_name $out_dir_name --pred_results $pred_results --train_results $train_results --input_training_file $input_training_file --input_testing_file $input_testing_file --id_map_path $id_map_path + +echo Comparing predicted and known communitites... 
+python3 eval_complex_RL.py --input_file_name $input_file_name --input_training_file $input_training_file --input_testing_file $input_testing_file --out_dir_name $out_dir_name --id_name_path $id_map_path diff --git a/input_humap.yaml b/input_humap.yaml index 525b6826..d75a259f 100644 --- a/input_humap.yaml +++ b/input_humap.yaml @@ -6,43 +6,11 @@ comf_nm: "/res_train_complexes_new_73_more.txt" comf_test_nm: "/res_test_complexes_new_73_more.txt" # Make sure no extra rows are present comf_nm_all: "/all_complexes.txt" out_comp_nm: "/results_qi0.325/res" -scale_factor: 10 # Number of times negatives should be higher than positives -use_full: 1 split_flag: 0 -fact: 0.99 -perc_transfer: 0.275 -mode: non_gen # gen means only feature extraction, non_gen is all # -------------------Training parameters-------------------------------- -feats: 6 -classifier_file: "humap/results_73_neg_unif_10x/res_classifiers_new.txt" # or remove new - CHECK -model_type: "tpot" # Options: tpot, NN -train_feat_mat: "humap/results_73_neg_unif_10x/res_train_dat.csv" -test_feat_mat: "humap/results_73_neg_unif_10x/res_test_dat.csv" -model_name: "tpot_select" #Options: FF_1hidden, log_reg, SVM, rand_forest, extra_trees, estimator_SVM model_dir: "/results_73_neg_unif_10x/res" # --------------------Search parameters ------------------------------ -seed_mode: "all_nodes" # Options:all_nodes_known_comp, all_nodes, n_nodes,cliques -num_comp: 5 # Options: 10, 7778, 1500 - only for n_nodes mode -classi_thresh: 0.5 - -run_mode: "parallel" # Options: serial, parallel -max_size_thres: 11 -search_method: "isa" # isa, metropolis, search_top_neigs, search_max_neig - -# All methods except max_neig -use_all_neigs: 1 -thres_neig: 30 # Maximum number of neighbors sampled for checking -min_thres_neig_sorted: 100 # Threshold above which only a percentage of neigs are considered as per sorted weights -perc: 0.7 # Percentage of neighbors to check for adding new node -explore_prob: 0.01 # use 0.1 for top_neigs - -# 
Metropolis algorithm params -prob_metropolis: 0.1 - -# ISA params -T0: 1.75 -alpha: 0.005 over_t: 0.325 # Overlap threshold = 0.7/0.9 overlap_method: "qi" # testing_qi_0.3 or 1 diff --git a/input_toy.yaml b/input_toy.yaml index 3dd71693..2678ae56 100644 --- a/input_toy.yaml +++ b/input_toy.yaml @@ -3,44 +3,10 @@ dir_nm: "toy_network" # Options: toy_network, toy_network_old, humap, humap2 sep: " " # Options: " ", "\t" out_comp_nm: "/results/res" split_flag: 0 -fact: 0.7 -perc_transfer: 0.2 -use_full: 1 -scale_factor: 1.1 # Number of times negatives should be higher than positives -mode: non_gen # gen means only feature extraction, non_gen is all # -------------------Training parameters-------------------------------- -feats: 6 - -model_type: "tpot" # Options: tpot, NN -train_feat_mat: "toy_network/results_train_dat.csv" -test_feat_mat: "toy_network/results_train_dat.csv" -model_name: "SVM" #Options: FF_1hidden, log_reg, SVM, rand_forest, extra_trees -# humap with separted train and test sets - tpot result - extra_trees model_dir: "/results/res" # --------------------Search parameters ------------------------------ -seed_mode: "all_nodes" # Options:all_nodes_known_comp, all_nodes, n_nodes, cliques -num_comp: 40 # Options: 10, 7778, 1500 - only for n_nodes mode - -run_mode: "parallel" # Options: serial, parallel -max_size_thres: 50 - -search_method: "search_top_neigs" # isa, metropolis, search_top_neigs, search_max_neig - -# All methods except search_max_neig -# No. 
of neighbors considered params -use_all_neigs: 1 -thres_neig: 30 # Maximum number of neighbors sampled for checking -min_thres_neig_sorted: 30 # Threshold above which only a percentage of neigs are considered as per sorted weights -perc: 0.7 # Percentage of neighbors to check for adding new node - -explore_prob: 0.01 # use 0.1 for top_neigs -# Metropolis algorithm params -prob_metropolis: 0.1 - -# ISA params -T0: 0.88 -alpha: 1.8 over_t: 0.1 # Overlap threshold = 0.7/0.9 infer_overlap_threshold: "y" diff --git a/main6_eval.py b/main6_eval.py index baf11886..39ae2f5a 100644 --- a/main6_eval.py +++ b/main6_eval.py @@ -21,7 +21,7 @@ sys_path.insert(1, 'functions_py3/') from yaml import load as yaml_load, dump as yaml_dump, Loader as yaml_Loader from argparse import ArgumentParser as argparse_ArgumentParser -from humap.functions.eval_cmplx_sc import eval_complex +from eval_cmplx_sc import eval_complex # from random_walk_control import control from logging import basicConfig as logging_basicConfig, INFO as logging_INFO @@ -93,10 +93,7 @@ def main(): parser = argparse_ArgumentParser("Input parameters") parser.add_argument("--input_file_name", default="input_toy.yaml", help="Input parameters file name") parser.add_argument("--out_dir_name", default="/results", help="Output directory name, by default - /results") - parser.add_argument("--seed_mode", help="Seed mode - specify 'cliques' for the cliques algo") parser.add_argument("--train_test_files_dir", default="", help="Train test file path") - - parser.add_argument("--search_method", help="Sampling algorithm") parser.add_argument("--model_dir", help="Directory containing model") parser.add_argument("--python_command", default="python", help="python / python3") parser.add_argument("--read_flag", default=0, help="1 when you want to read from file for evaluation") diff --git a/main_prediction.py b/main_prediction.py index cb811709..dc6c7c32 100644 --- a/main_prediction.py +++ b/main_prediction.py @@ -113,14 +113,21 @@ def 
pred_complex(n, nodes_list, G, gg, value_functions, intervals,args): # args.pred_results = "../results/pred_results" file = args.pred_results + '/nodes_complexes/' + with open(file + str(n), 'wb') as f: pickle_dump(tup_cmplx, f) with open(file + str(n), 'rb') as f: pickle_load(f) +import os def network(G, gg, nodes, intervals, value_functions,args): ## input data + + fol = args.pred_results + '/nodes_complexes/' + if not os.path.exists(fol): + os.mkdir(fol) + nodes_list = list(nodes) # make sure all intervals are accounted for for i in intervals: @@ -128,6 +135,8 @@ def network(G, gg, nodes, intervals, value_functions,args): val_fn = interpolate(value_functions, i) value_functions[i] = val_fn filename = args.pred_results + '/value_fns_pred.pkl' + if not os.path.exists(args.pred_results): + os.mkdir(args.pred_results) with open(filename, 'wb') as f: pickle.dump(value_functions, f) fname = args.pred_results + '/value_fns_interp.txt' @@ -135,7 +144,11 @@ def network(G, gg, nodes, intervals, value_functions,args): f.write(str(value_functions)) # parallel running - num_cores = mul_cpu_count() + if args.n_cores == "all": + num_cores = mul_cpu_count() + else: + num_cores = int(args.n_cores) + print("No. of cores used = ",num_cores) Parallel(n_jobs=num_cores, backend='loky')( delayed(pred_complex)(node, nodes_list, G, gg, value_functions, intervals,args) for node in tqdm(nodes_list)) @@ -162,6 +175,7 @@ def main(): parser.add_argument("--train_results", default="", help="Directory for training results") parser.add_argument("--pred_results", default="", help="Directory for prediction results") parser.add_argument("--out_dir_name", default = "", help = 'Main output directory') + parser.add_argument("--n_cores", default = "all", help = 'No. 
of cores to use for parallel processing') args = parser.parse_args() #os.makedirs(args.pred_results + '/nodes_complexes', exist_ok=True) diff --git a/main_training.py b/main_training.py index 9266506c..25845b2e 100755 --- a/main_training.py +++ b/main_training.py @@ -131,12 +131,16 @@ def network(G, gg, value_dict, dens_counter, valuefn_update, intervals, subgraph return gg # e += 1 +import os def main(): start_time = time.time() matplotlib.use('Agg') logging.basicConfig(level=logging.WARNING) - matplotlib.use('tkagg') + try: + matplotlib.use('tkagg') + except: + print("Can't use tkagg backend") # input data parser = argparse_ArgumentParser("Input parameters") parser.add_argument("--input_training_file", default="", help="Training Complexes file path") @@ -181,6 +185,8 @@ def main(): network(G, gg, value_dict, dens_counter, valuefn_update, intervals, subgraphs) # save value function scores in dictionary #args.train_results = "../results/train_results" + if not os.path.exists(args.train_results): + os.mkdir(args.train_results) fname = args.train_results + "/value_fn_dens_dict.txt" file = open(fname, "w") value_dict_sorted = sorted(value_dict.items()) diff --git a/postprocessing.py b/postprocessing.py index 3ee52121..bd5c577c 100644 --- a/postprocessing.py +++ b/postprocessing.py @@ -61,15 +61,22 @@ def main(): file = '' if inputs['dir_nm'] == 'toy_network': file = args.out_dir_name + '/qi_results' + + if not os.path.exists(args.out_dir_name + '/qi_results'): + os.mkdir(args.out_dir_name + '/qi_results') filename = file + '/res' else: if inputs['overlap_method'] == 'qi': file = args.out_dir_name + '/qi_results' + if not os.path.exists(args.out_dir_name + '/qi_results'): + os.mkdir(args.out_dir_name + '/qi_results') #os.makedirs(args.out_dir_name + '/qi_results', exist_ok=True) filename = file + '/res' # inputs['out_comp_nm'] #os.makedirs(file + '/results_qi', exist_ok=True) elif inputs["overlap_method"] == '1': # jaccard coeff file = args.out_dir_name + 
'/jacc_results' + if not os.path.exists(file): + os.mkdir(file) #os.makedirs(args.out_dir_name + '/jacc_results', exist_ok=True) filename = file + '/res' # inputs['out_comp_nm'] #os.makedirs(file + '/results_jacc', exist_ok=True) diff --git a/read_complexes.py b/read_complexes.py index af10f6cc..4cfc3a7e 100644 --- a/read_complexes.py +++ b/read_complexes.py @@ -9,7 +9,7 @@ from jaccard_coeff import jaccard_coeff from numpy import mean as np_mean, argmax as np_argmax, var as np_var, sqrt as sqrt from numpy.random import permutation as rand_perm, choice as rand_choice -from logging import info as logging_info +from logging import info as logging_info, debug as debug_info from networkx import write_weighted_edgelist as nx_write_weighted_edgelist, is_connected as nx_is_connected from scipy.stats import norm as norm_dist from convert_humap_ids2names import convert2names @@ -199,55 +199,6 @@ def split_ratio(perm_lines, ratio): return train_list, test_list -def split_meth_orig(perm_lines, inputs): - fact = inputs['fact'] # 0.99 - split_pt = int(round(len(perm_lines) * fact)) - train_list = [line for line in perm_lines[0:split_pt]] - test_list = [line for line in perm_lines[split_pt:]] - # Start with something that has a biased size distribution !! - - sizes = [len(line) for line in train_list] - train_mean = np_mean(sizes) - - # Transferring some of the smaller complexes to the test list - train_list_lower_mean = [line for line in train_list if len(line) < train_mean] - perc_transfer = inputs['perc_transfer'] # 0.3 # You can optimize these parameters ! 
- to_transfer = train_list_lower_mean[:int(round(len(train_list_lower_mean) * perc_transfer))] - test_list = test_list + to_transfer - - # Now remove from train set - for line in to_transfer: - train_list.remove(line) - - # Finding complexes in train that share an edge with a complex in test - com_comp = 10 - while com_comp != 0: # Do until train and test sets are completely separated - - # Removing super huge complexes also (nodes >30 ) from test set - test_list = [line for line in test_list if len(line) < 30] - - # REMOVE OVERLAP B/W TRAIN AND TEST DATA - # Remove complexes from train set sharing two proteins with test set - train_rem = [] - train_rem_append = train_rem.append - com_comp = 0 - for train_line in train_list: - pres = 0 - for test_line in test_list: - common = len(set(train_line.edges()).intersection(set(test_line.edges))) - if common >= 1: - pres = 1 - break - if pres == 1: - train_rem_append(train_line) - com_comp += 1 - - logging_info("No. of train complexes transferred = %s", str(com_comp)) - test_list = test_list + train_rem - for t_line in train_rem: - train_list.remove(t_line) - return train_list, test_list - def merge_overlapped(list_comp,overlap_thres = 0.6): logging_info("Merging complexes...") @@ -354,7 +305,6 @@ def split_train_test_complexes(inputs, G): perm_lines = rand_perm(complexes) ratio = (70, 30) - # train_list, test_list = split_meth_orig(perm_lines, inputs) train_list, test_list = split_ratio(perm_lines, ratio) plot_size_dists(train_list, test_list, out_comp_nm) with open(out_comp_nm + "_train_complexes_new.txt", "w") as f: @@ -366,8 +316,6 @@ def split_train_test_complexes(inputs, G): f.write(sep.join(line) + "\n") with open(out_comp_nm + '_metrics.out', "a") as fid: print("Split ratio = %.3f" % str(float(len(train_list)) / len(test_list)), file=fid) - # print("Initial train_test split = ", fact, file=fid) - # print("Percentage of low sizes transferred from train to test = ", perc_transfer, file=fid) return train_list, 
test_list diff --git a/requirements_py3.txt b/requirements_py3.txt index 78c0bc1c..c6d28759 100644 --- a/requirements_py3.txt +++ b/requirements_py3.txt @@ -8,13 +8,5 @@ joblib==1.1.0 tqdm==4.63.0 numpy==1.22.0 pandas==1.4.1 -scikit-MDR==0.4.4 -deap==1.3.1 -update-checker==0.18.0 -stopit==1.1.2 -TPOT==0.11.7 seaborn==0.11.2 -xgboost -tensorflow -pytest pypiwin32 \ No newline at end of file diff --git a/toy_network/.DS_Store b/toy_network/.DS_Store deleted file mode 100644 index 5008ddfc..00000000 Binary files a/toy_network/.DS_Store and /dev/null differ