From 7ad552a161a21e9a4e77375a1047de65c607544b Mon Sep 17 00:00:00 2001
From: xjywhu
Date: Mon, 16 Dec 2024 22:21:17 +0800
Subject: [PATCH] update the CogVLM utils

---
 CogVLM/utils/models/cogagent_model.py | 14 ++++-----
 CogVLM/utils/models/cogvlm_model.py   | 12 +++----
 CogVLM/utils/models/eva_clip_model.py |  4 +--
 CogVLM/utils/split_dataset.py         | 16 +++++-----
 CogVLM/utils/utils/__init__.py        |  2 +-
 CogVLM/utils/utils/dataset.py         | 45 ---------------------------
 CogVLM/utils/utils/language.py        |  5 ---
 7 files changed, 24 insertions(+), 74 deletions(-)

diff --git a/CogVLM/utils/models/cogagent_model.py b/CogVLM/utils/models/cogagent_model.py
index ea29b05..fc4967d 100644
--- a/CogVLM/utils/models/cogagent_model.py
+++ b/CogVLM/utils/models/cogagent_model.py
@@ -155,8 +155,8 @@ def word_embedding_forward(self, input_ids, output_cross_layer, **kw_args):
         return word_embedding.contiguous()
 
 class CogAgentModel(LLaMAModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kwargs):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kwargs)
+    def __init__(self, args, transformer=None, **kwargs):
+        super().__init__(args, transformer=transformer, **kwargs)
         self.image_length = args.image_length
         self.cross_image_pix = args.cross_image_pix
         self.add_mixin("eva", ImageMixin(args))
@@ -197,8 +197,8 @@ def forward(self, input_ids, vision_expert_mask, image_embed_mask, **kwargs):
 
 
 class FineTuneTrainCogAgentModel(CogAgentModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kw_args):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kw_args)
+    def __init__(self, args, transformer=None, **kw_args):
+        super().__init__(args, transformer=transformer, **kw_args)
         self.args = args
         # If you want to use model parallel with a mp_size=1 checkpoint, and meanwhile you also want to use lora,
         # you have to add_mixin after loading model checkpoint.
@@ -218,13 +218,13 @@ def add_model_specific_args(cls, parser):
 from sat.model.finetune import PTuningV2Mixin
 from sat.model.finetune.lora2 import LoraMixin
 class FineTuneTestCogAgentModel(CogAgentModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kw_args):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kw_args)
+    def __init__(self, args, transformer=None, **kw_args):
+        super().__init__(args, transformer=transformer, **kw_args)
         if args.use_ptuning:
             self.add_mixin("ptuning", PTuningV2Mixin(args.num_layers, args.hidden_size // args.num_attention_heads, args.num_attention_heads, args.pre_seq_len))
         if args.use_lora:
             self.add_mixin("lora", LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range), reinit=True)
-
+            self.get_mixin("eva").vit_model.add_mixin("lora", LoraMixin(args.eva_args['num_layers'], args.lora_rank, layer_range=args.layer_range), reinit=True)
         elif args.use_qlora:
             self.add_mixin("lora", LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range, qlora=True), reinit=True)
         self.args = args
diff --git a/CogVLM/utils/models/cogvlm_model.py b/CogVLM/utils/models/cogvlm_model.py
index 37b9a4a..30600eb 100644
--- a/CogVLM/utils/models/cogvlm_model.py
+++ b/CogVLM/utils/models/cogvlm_model.py
@@ -98,8 +98,8 @@ def word_embedding_forward(self, input_ids, output_cross_layer, **kw_args):
 
 
 class CogVLMModel(LLaMAModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kwargs):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kwargs)
+    def __init__(self, args, transformer=None, **kwargs):
+        super().__init__(args, transformer=transformer, **kwargs)
         self.image_length = args.image_length
         self.add_mixin("eva", ImageMixin(args))
         self.del_mixin("mlp")
@@ -121,8 +121,8 @@ def forward(self, input_ids, vision_expert_mask, image_embed_mask, **kwargs):
 
 
 class FineTuneTrainCogVLMModel(CogVLMModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kw_args):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kw_args)
+    def __init__(self, args, transformer=None, **kw_args):
+        super().__init__(args, transformer=transformer, **kw_args)
         self.args = args
         # If you want to use model parallel with a mp_size=1 checkpoint, and meanwhile you also want to use lora,
         # you have to add_mixin after loading model checkpoint.
@@ -142,8 +142,8 @@ def add_model_specific_args(cls, parser):
 from sat.model.finetune import PTuningV2Mixin
 from sat.model.finetune.lora2 import LoraMixin
 class FineTuneTestCogVLMModel(CogVLMModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kw_args):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kw_args)
+    def __init__(self, args, transformer=None, **kw_args):
+        super().__init__(args, transformer=transformer, **kw_args)
         if args.use_ptuning:
             self.add_mixin("ptuning", PTuningV2Mixin(args.num_layers, args.hidden_size // args.num_attention_heads, args.num_attention_heads, args.pre_seq_len))
         if args.use_lora:
diff --git a/CogVLM/utils/models/eva_clip_model.py b/CogVLM/utils/models/eva_clip_model.py
index 8730f1f..6182871 100644
--- a/CogVLM/utils/models/eva_clip_model.py
+++ b/CogVLM/utils/models/eva_clip_model.py
@@ -102,12 +102,12 @@ def layer_forward(self, hidden_states, mask, *args, **kw_args):
         return output
 
 class EVA2CLIPModel(BaseModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kwargs):
+    def __init__(self, args, transformer=None, **kwargs):
         property = ViTProperty(args.image_size, args.patch_size, args.pre_len, args.post_len)
         args.max_sequence_length = property.pre_len + property.num_patches + property.post_len
         if 'activation_func' not in kwargs:
             kwargs['activation_func'] = gelu
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kwargs)
+        super().__init__(args, transformer=transformer, **kwargs)
         self.transformer.property = property
         self.add_mixin("patch_embedding", ImagePatchEmbeddingMixin(args.in_channels, args.hidden_size, property))
         self.add_mixin("pos_embedding", InterpolatedPositionEmbeddingMixin())
diff --git a/CogVLM/utils/split_dataset.py b/CogVLM/utils/split_dataset.py
index 361149e..21a7554 100644
--- a/CogVLM/utils/split_dataset.py
+++ b/CogVLM/utils/split_dataset.py
@@ -10,11 +10,11 @@ def find_all_files(path, suffix=".jpg"):
     print(f'find {len(target_files)} files...')
     return target_files
 
-all_files = find_all_files('/scr/zyanzhe/archive')
-os.makedirs("/scr/zyanzhe/archive_split", exist_ok=True)
-os.makedirs("/scr/zyanzhe/archive_split/train", exist_ok=True)
-os.makedirs("/scr/zyanzhe/archive_split/valid", exist_ok=True)
-os.makedirs("/scr/zyanzhe/archive_split/test", exist_ok=True)
+all_files = find_all_files('archive')
+os.makedirs("archive_split", exist_ok=True)
+os.makedirs("archive_split/train", exist_ok=True)
+os.makedirs("archive_split/valid", exist_ok=True)
+os.makedirs("archive_split/test", exist_ok=True)
 
 import random
 random.seed(2023)
@@ -25,11 +25,11 @@ def find_all_files(path, suffix=".jpg"):
 
 print("building train")
 for file in train:
-    shutil.move(file, os.path.join("/scr/zyanzhe/archive_split/train", file.split("/")[-1]))
+    shutil.move(file, os.path.join("archive_split/train", file.split("/")[-1]))
 print("building valid")
 for file in valid:
-    shutil.move(file, os.path.join("/scr/zyanzhe/archive_split/valid", file.split("/")[-1]))
+    shutil.move(file, os.path.join("archive_split/valid", file.split("/")[-1]))
 print("building test")
 for file in test:
-    shutil.move(file, os.path.join("/scr/zyanzhe/archive_split/test", file.split("/")[-1]))
+    shutil.move(file, os.path.join("archive_split/test", file.split("/")[-1]))
 print("done")
\ No newline at end of file
diff --git a/CogVLM/utils/utils/__init__.py b/CogVLM/utils/utils/__init__.py
index 788ba80..b52d5db 100644
--- a/CogVLM/utils/utils/__init__.py
+++ b/CogVLM/utils/utils/__init__.py
@@ -2,4 +2,4 @@
 from .language import llama2_tokenizer, llama2_text_processor, llama2_text_processor_inference
 from .vision import get_image_processor
 from .grounding_parser import parse_response
-from .dataset import ItemDataset, HTMLDataset
\ No newline at end of file
+from .dataset import ItemDataset
\ No newline at end of file
diff --git a/CogVLM/utils/utils/dataset.py b/CogVLM/utils/utils/dataset.py
index 7b7db32..2b54c9c 100644
--- a/CogVLM/utils/utils/dataset.py
+++ b/CogVLM/utils/utils/dataset.py
@@ -58,49 +58,4 @@ def __getitem__(self, index):
             return {}
         # other attr
         ret = {**img_dict, **text_dict, "question_id": uni_key}
-        return ret
-
-from datasets import load_from_disk
-
-class HTMLDataset(Dataset):
-    def __init__(self, image_processor, text_processor, args, data_dirs, cross_image_processor=None, **kwargs):
-        super().__init__()
-        self.data = self.load_data(data_dirs)
-        self.image_processor, self.text_processor, self.cross_image_processor = image_processor, text_processor, cross_image_processor
-
-    def process_img(self, img):
-        img_dict = {'vision': self.image_processor(img)}
-        if self.cross_image_processor:
-            img_dict.update({'cross': self.cross_image_processor(img)})
-        return img_dict
-
-    def process_text(self, answer, prompt):
-        return self.text_processor(answer, prompt)
-
-    def load_data(self, data_dir):
-        ds = load_from_disk(data_dir)
-        print_rank0(f"find {len(ds)} samples in all...")
-        return ds
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(self, index):
-        data = self.data[index]
-        # img
-        try:
-            img = data['image'].convert('RGB')
-        except Exception as e:
-            print_rank0(e, level=logging.WARNING)
-            return {}
-        img_dict = self.process_img(img)
-        # text
-        label = data['text']
-        uni_key = str(len(self.data)) + str(index)
-        text_dict = self.process_text(label, "")
-        if text_dict is None:
-            print_rank0(f"Process text failed. Please check the max_target_length & max_source_length.\n The data is {data}, index, {index}", level=logging.WARNING)
-            return {}
-        # other attr
-        ret = {**img_dict, **text_dict, "question_id": uni_key}
         return ret
\ No newline at end of file
diff --git a/CogVLM/utils/utils/language.py b/CogVLM/utils/utils/language.py
index 6730286..d3b9dad 100644
--- a/CogVLM/utils/utils/language.py
+++ b/CogVLM/utils/utils/language.py
@@ -1,8 +1,3 @@
-from sat.model.official.llama_model import LLaMAModel, rotate_half
-from sat.transformer_defaults import attention_fn_default, split_tensor_along_last_dim
-import torch.nn.functional as F
-
-
 def base_history_to_prompt(self, query, history):
     prompt = '' + query
     return prompt
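Note (illustrative; not part of the patch above): with the hard-coded /scr/zyanzhe/... paths removed, split_dataset.py resolves everything relative to the working directory. It expects an archive/ folder of .jpg images where it is run, seeds Python's random module with 2023 before splitting, and moves the files into archive_split/train, archive_split/valid, and archive_split/test. The Python sketch below only checks that assumed layout before the script is run; find_all_files is re-implemented here because its body is not visible in the hunk, and check_split_layout is a hypothetical helper name.

import os

def find_all_files(path, suffix=".jpg"):
    # Stand-in for the helper in split_dataset.py (its full body is not shown in the diff):
    # recursively collect files ending with `suffix`.
    target_files = []
    for cur_dir, _dirs, files in os.walk(path, followlinks=True):
        for name in files:
            if name.endswith(suffix):
                target_files.append(os.path.join(cur_dir, name))
    print(f'find {len(target_files)} files...')
    return target_files

def check_split_layout(root="."):
    # Verify the relative source directory exists and report where the splits will land.
    src = os.path.join(root, "archive")
    if not os.path.isdir(src):
        raise SystemExit("expected a relative 'archive/' directory of .jpg files")
    files = find_all_files(src)
    for split in ("train", "valid", "test"):
        print("files will be moved into", os.path.join(root, "archive_split", split))
    return files

if __name__ == "__main__":
    check_split_layout()

Run this check (and split_dataset.py itself) from the directory that contains archive/, e.g. CogVLM/utils/, so the relative paths resolve the same way the script will see them.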