14 changes: 7 additions & 7 deletions CogVLM/utils/models/cogagent_model.py
@@ -155,8 +155,8 @@ def word_embedding_forward(self, input_ids, output_cross_layer, **kw_args):
         return word_embedding.contiguous()
 
 class CogAgentModel(LLaMAModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kwargs):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kwargs)
+    def __init__(self, args, transformer=None, **kwargs):
+        super().__init__(args, transformer=transformer, **kwargs)
         self.image_length = args.image_length
         self.cross_image_pix = args.cross_image_pix
         self.add_mixin("eva", ImageMixin(args))
@@ -197,8 +197,8 @@ def forward(self, input_ids, vision_expert_mask, image_embed_mask, **kwargs):
 
 
 class FineTuneTrainCogAgentModel(CogAgentModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kw_args):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kw_args)
+    def __init__(self, args, transformer=None, **kw_args):
+        super().__init__(args, transformer=transformer, **kw_args)
         self.args = args
         # If you want to use model parallel with a mp_size=1 checkpoint, and meanwhile you also want to use lora,
         # you have to add_mixin after loading model checkpoint.
@@ -218,13 +218,13 @@ def add_model_specific_args(cls, parser):
 from sat.model.finetune import PTuningV2Mixin
 from sat.model.finetune.lora2 import LoraMixin
 class FineTuneTestCogAgentModel(CogAgentModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kw_args):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kw_args)
+    def __init__(self, args, transformer=None, **kw_args):
+        super().__init__(args, transformer=transformer, **kw_args)
         if args.use_ptuning:
             self.add_mixin("ptuning", PTuningV2Mixin(args.num_layers, args.hidden_size // args.num_attention_heads, args.num_attention_heads, args.pre_seq_len))
         if args.use_lora:
             self.add_mixin("lora", LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range), reinit=True)
             self.get_mixin("eva").vit_model.add_mixin("lora", LoraMixin(args.eva_args['num_layers'], args.lora_rank, layer_range=args.layer_range), reinit=True)
         elif args.use_qlora:
             self.add_mixin("lora", LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range, qlora=True), reinit=True)
         self.args = args
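The constructor change in this file drops the explicit parallel_output=True keyword from the subclass signatures; callers that still pass parallel_output now reach the SAT base class through **kwargs instead (the pre-change lines show that LLaMAModel's __init__ accepted it directly). A minimal sketch of that forwarding, using stand-in classes rather than the real sat models; all names below are illustrative only:

# Hedged sketch: with the new signature, parallel_output simply rides along in **kwargs.
class StandInBase:                      # stand-in for sat's LLaMAModel, not the real class
    def __init__(self, args, transformer=None, parallel_output=True, **kwargs):
        self.parallel_output = parallel_output

class StandInCogAgent(StandInBase):     # mirrors the edited __init__ signature
    def __init__(self, args, transformer=None, **kwargs):
        super().__init__(args, transformer=transformer, **kwargs)

m = StandInCogAgent(args=None, parallel_output=False)
print(m.parallel_output)                # -> False: the keyword still reaches the base class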
12 changes: 6 additions & 6 deletions CogVLM/utils/models/cogvlm_model.py
@@ -98,8 +98,8 @@ def word_embedding_forward(self, input_ids, output_cross_layer, **kw_args):
 
 
 class CogVLMModel(LLaMAModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kwargs):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kwargs)
+    def __init__(self, args, transformer=None, **kwargs):
+        super().__init__(args, transformer=transformer, **kwargs)
         self.image_length = args.image_length
         self.add_mixin("eva", ImageMixin(args))
         self.del_mixin("mlp")
@@ -121,8 +121,8 @@ def forward(self, input_ids, vision_expert_mask, image_embed_mask, **kwargs):
 
 
 class FineTuneTrainCogVLMModel(CogVLMModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kw_args):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kw_args)
+    def __init__(self, args, transformer=None, **kw_args):
+        super().__init__(args, transformer=transformer, **kw_args)
         self.args = args
         # If you want to use model parallel with a mp_size=1 checkpoint, and meanwhile you also want to use lora,
         # you have to add_mixin after loading model checkpoint.
@@ -142,8 +142,8 @@ def add_model_specific_args(cls, parser):
 from sat.model.finetune import PTuningV2Mixin
 from sat.model.finetune.lora2 import LoraMixin
 class FineTuneTestCogVLMModel(CogVLMModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kw_args):
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kw_args)
+    def __init__(self, args, transformer=None, **kw_args):
+        super().__init__(args, transformer=transformer, **kw_args)
         if args.use_ptuning:
             self.add_mixin("ptuning", PTuningV2Mixin(args.num_layers, args.hidden_size // args.num_attention_heads, args.num_attention_heads, args.pre_seq_len))
         if args.use_lora:
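The comment kept in the hunk above notes that combining model parallelism from an mp_size=1 checkpoint with LoRA requires calling add_mixin after the checkpoint has been loaded. A rough sketch of that ordering under stated assumptions: build_model() and load_checkpoint() below are hypothetical placeholders for whatever the training script actually uses, not confirmed SAT APIs; only the LoraMixin import and the add_mixin call are taken from the diff:

# Hedged sketch of the ordering described by the in-code comment above.
# build_model() and load_checkpoint() are hypothetical placeholders.
from sat.model.finetune.lora2 import LoraMixin   # import shown in the diff

model, args = build_model()     # construct FineTuneTrainCogVLMModel without the LoRA mixin
load_checkpoint(model, args)    # load the mp_size=1 checkpoint first...
model.add_mixin(                # ...then attach LoRA afterwards, as the comment requires
    "lora",
    LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range),
    reinit=True,
)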
4 changes: 2 additions & 2 deletions CogVLM/utils/models/eva_clip_model.py
@@ -102,12 +102,12 @@ def layer_forward(self, hidden_states, mask, *args, **kw_args):
         return output
 
 class EVA2CLIPModel(BaseModel):
-    def __init__(self, args, transformer=None, parallel_output=True, **kwargs):
+    def __init__(self, args, transformer=None, **kwargs):
         property = ViTProperty(args.image_size, args.patch_size, args.pre_len, args.post_len)
         args.max_sequence_length = property.pre_len + property.num_patches + property.post_len
         if 'activation_func' not in kwargs:
             kwargs['activation_func'] = gelu
-        super().__init__(args, transformer=transformer, parallel_output=parallel_output, **kwargs)
+        super().__init__(args, transformer=transformer, **kwargs)
         self.transformer.property = property
         self.add_mixin("patch_embedding", ImagePatchEmbeddingMixin(args.in_channels, args.hidden_size, property))
         self.add_mixin("pos_embedding", InterpolatedPositionEmbeddingMixin())
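The context lines above show EVA2CLIPModel deriving args.max_sequence_length from the ViT geometry (pre_len + num_patches + post_len) before calling the base constructor. A small worked example of that arithmetic, assuming ViTProperty counts patches as (image_size // patch_size) ** 2; the concrete numbers are illustrative and not taken from this PR:

# Hedged arithmetic sketch of the max_sequence_length computation shown above.
image_size, patch_size = 224, 14    # assumed example values
pre_len, post_len = 1, 0            # e.g. one [CLS]-style prefix token, no suffix tokens

num_patches = (image_size // patch_size) ** 2    # 16 * 16 = 256 patch tokens
max_sequence_length = pre_len + num_patches + post_len
print(max_sequence_length)                       # -> 257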
16 changes: 8 additions & 8 deletions CogVLM/utils/split_dataset.py
@@ -10,11 +10,11 @@ def find_all_files(path, suffix=".jpg"):
     print(f'find {len(target_files)} files...')
     return target_files
 
-all_files = find_all_files('/scr/zyanzhe/archive')
-os.makedirs("/scr/zyanzhe/archive_split", exist_ok=True)
-os.makedirs("/scr/zyanzhe/archive_split/train", exist_ok=True)
-os.makedirs("/scr/zyanzhe/archive_split/valid", exist_ok=True)
-os.makedirs("/scr/zyanzhe/archive_split/test", exist_ok=True)
+all_files = find_all_files('archive')
+os.makedirs("archive_split", exist_ok=True)
+os.makedirs("archive_split/train", exist_ok=True)
+os.makedirs("archive_split/valid", exist_ok=True)
+os.makedirs("archive_split/test", exist_ok=True)
 
 import random
 random.seed(2023)
@@ -25,11 +25,11 @@ def find_all_files(path, suffix=".jpg"):
 
 print("building train")
 for file in train:
-    shutil.move(file, os.path.join("/scr/zyanzhe/archive_split/train", file.split("/")[-1]))
+    shutil.move(file, os.path.join("archive_split/train", file.split("/")[-1]))
 print("building valid")
 for file in valid:
-    shutil.move(file, os.path.join("/scr/zyanzhe/archive_split/valid", file.split("/")[-1]))
+    shutil.move(file, os.path.join("archive_split/valid", file.split("/")[-1]))
 print("building test")
 for file in test:
-    shutil.move(file, os.path.join("/scr/zyanzhe/archive_split/test", file.split("/")[-1]))
+    shutil.move(file, os.path.join("archive_split/test", file.split("/")[-1]))
 print("done")
2 changes: 1 addition & 1 deletion CogVLM/utils/utils/__init__.py
@@ -2,4 +2,4 @@
 from .language import llama2_tokenizer, llama2_text_processor, llama2_text_processor_inference
 from .vision import get_image_processor
 from .grounding_parser import parse_response
-from .dataset import ItemDataset, HTMLDataset
+from .dataset import ItemDataset
45 changes: 0 additions & 45 deletions CogVLM/utils/utils/dataset.py
@@ -58,49 +58,4 @@ def __getitem__(self, index):
             return {}
         # other attr
         ret = {**img_dict, **text_dict, "question_id": uni_key}
         return ret
-
-from datasets import load_from_disk
-
-class HTMLDataset(Dataset):
-    def __init__(self, image_processor, text_processor, args, data_dirs, cross_image_processor=None, **kwargs):
-        super().__init__()
-        self.data = self.load_data(data_dirs)
-        self.image_processor, self.text_processor, self.cross_image_processor = image_processor, text_processor, cross_image_processor
-
-    def process_img(self, img):
-        img_dict = {'vision': self.image_processor(img)}
-        if self.cross_image_processor:
-            img_dict.update({'cross': self.cross_image_processor(img)})
-        return img_dict
-
-    def process_text(self, answer, prompt):
-        return self.text_processor(answer, prompt)
-
-    def load_data(self, data_dir):
-        ds = load_from_disk(data_dir)
-        print_rank0(f"find {len(ds)} samples in all...")
-        return ds
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(self, index):
-        data = self.data[index]
-        # img
-        try:
-            img = data['image'].convert('RGB')
-        except Exception as e:
-            print_rank0(e, level=logging.WARNING)
-            return {}
-        img_dict = self.process_img(img)
-        # text
-        label = data['text']
-        uni_key = str(len(self.data)) + str(index)
-        text_dict = self.process_text(label, "")
-        if text_dict is None:
-            print_rank0(f"Process text failed. Please check the max_target_length & max_source_length.\n The data is {data}, index, {index}", level=logging.WARNING)
-            return {}
-        # other attr
-        ret = {**img_dict, **text_dict, "question_id": uni_key}
-        return ret
5 changes: 0 additions & 5 deletions CogVLM/utils/utils/language.py
@@ -1,8 +1,3 @@
-from sat.model.official.llama_model import LLaMAModel, rotate_half
-from sat.transformer_defaults import attention_fn_default, split_tensor_along_last_dim
-import torch.nn.functional as F
-
-
 def base_history_to_prompt(self, query, history):
     prompt = '<EOI>' + query
     return prompt