diff --git a/configs/_cluster/helios.yaml b/configs/_cluster/helios.yaml index 47dc9835..cdbed2af 100644 --- a/configs/_cluster/helios.yaml +++ b/configs/_cluster/helios.yaml @@ -21,7 +21,7 @@ infrastructure: # export pixi variables - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi' - 'export PATH="$HOME/.pixi/bin:$PATH"' - - 'export XDG_DATA_HOME="PROJECT_HOME_PATH/data"' + - 'export XDG_DATA_HOME="$PROJECT_HOME_PATH/data"' - 'export XDG_CACHE_HOME="$PROJECT_HOME_PATH/cache"' - 'export XDG_STATE_HOME="$PROJECT_HOME_PATH/state"' diff --git a/configs/pc_project/llama_1B_importances.yaml b/configs/pc_project/llama_1B_importances.yaml index 9f6b78a2..4f1232ad 100644 --- a/configs/pc_project/llama_1B_importances.yaml +++ b/configs/pc_project/llama_1B_importances.yaml @@ -56,7 +56,7 @@ apply_functions: dataloader: ${trainer.train_dataloader} dmodel: ${common.dmodel} dff: ${common.dff} - calibration_dataset_size: 8192 # nvidia used 2k steps of 4k sequence lenght - this is the saturaion poin - longer doesnt improve miningfully + calibration_dataset_size: 8192 # nvidia used 2k steps of 4k sequence length - this is the saturation point - longer doesn't improve meaningfully seq_len: ${common.sequence_length} total_batch_size: ${trainer.train_dataloader.total_batch_size} n_blocks: ${model.encoder.n_blocks} diff --git a/configs/pc_project/llama_8B_importances.yaml b/configs/pc_project/llama_8B_importances.yaml index bb1d69cd..da704e7b 100644 --- a/configs/pc_project/llama_8B_importances.yaml +++ b/configs/pc_project/llama_8B_importances.yaml @@ -56,7 +56,7 @@ apply_functions: dataloader: ${trainer.train_dataloader} dmodel: ${common.dmodel} dff: ${common.dff} - calibration_dataset_size: 8192 # nvidia used 2k steps of 4k sequence lenght - this is the saturaion poin - longer doesnt improve miningfully + calibration_dataset_size: 8192 # nvidia used 2k steps of 4k sequence length - this is the saturation point - longer doesn't improve meaningfully seq_len: ${common.sequence_length} 
total_batch_size: ${trainer.train_dataloader.total_batch_size} n_blocks: ${model.encoder.n_blocks} diff --git a/main.py b/main.py index ce42d3f2..7eba3894 100644 --- a/main.py +++ b/main.py @@ -70,7 +70,7 @@ def check_env_vars(): assert int(os.environ["RANK"]) < int(os.environ["WORLD_SIZE"]) -def setup_enviroment(): +def setup_environment(): if "WORLD_SIZE" not in os.environ: logger.warning("WORLD_SIZE is not set, setting it to 1") os.environ["WORLD_SIZE"] = "1" @@ -267,7 +267,7 @@ def initialize_training_components(cfg: OmegaConf, metric_logger=None): cfg, model, learning_rate ) elif cfg.trainer.checkpoint.load.type == "nano": - # TODO! if you want to apply function on loaded model it does NOT work now, it applies function on newly inintialized model than it loads model weights + # TODO! if you want to apply function on loaded model it does NOT work now, it applies function on newly initialized model then it loads model weights model, optimizer, scheduler = get_model_optimizer_scheduler( cfg, model, learning_rate ) @@ -292,7 +292,7 @@ def initialize_training_components(cfg: OmegaConf, metric_logger=None): def run(cfg: OmegaConf, metric_logger=None): - setup_enviroment() + setup_environment() if "distributed" in cfg.trainer and cfg.trainer.distributed is not None: distributed_setup() diff --git a/src/core/utils.py b/src/core/utils.py index ab9efe33..64f8259a 100644 --- a/src/core/utils.py +++ b/src/core/utils.py @@ -46,7 +46,7 @@ def solve_config_lr( config_lr: float, ) -> tuple[ float, float -]: # TODO temporary place - move to devinitions eval+ when created +]: # TODO temporary place - move to definitions eval+ when created ret_lr, ret_exp_lr = None, None if config_lr < 1.0: ret_lr = config_lr