The forward / backward / optimizer call flow of the fine-tuning pipeline in the ModelScope system
Taking the DreamBooth fine-tuning mode of Stable Diffusion as an example, the training call flow is roughly as shown in the figure.
In code, the entry point is:
# finetune_stable_diffusion_dreambooth.py
# trainer: <modelscope.trainers.multi_modal.dreambooth_diffusion.dreambooth_diffusion_trainer.DreamboothDiffusionTrainer>
trainer = build_trainer(name=Trainers.dreambooth_diffusion, default_args=kwargs)
trainer.train()
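For context, a minimal sketch of how such a trainer is typically assembled end to end. The model id, dataset id and the exact set of keys in kwargs below are illustrative assumptions rather than values taken from the snippet above; the options actually accepted by DreamboothDiffusionTrainer may differ:

# illustrative setup sketch (not from the original script)
from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer

# Assumed identifiers -- substitute the base model and instance dataset you actually use.
train_dataset = MsDataset.load('your_namespace/your_dreambooth_dataset', split='train')
kwargs = dict(
    model='AI-ModelScope/stable-diffusion-v1-5',  # base model to fine-tune (illustrative)
    work_dir='./dreambooth_work_dir',             # where checkpoints and logs are written
    train_dataset=train_dataset,
    max_epochs=1)

trainer = build_trainer(name=Trainers.dreambooth_diffusion, default_args=kwargs)
trainer.train()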
From the Trainer object, we can see that the train_loop process is:
# trainer.py
# EpochBasedTrainer
def train_loop(self, data_loader):
    """ Training loop used by `EpochBasedTrainer.train()` """
    self.invoke_hook(TrainerStages.before_run)
    self.model.train()
    for _ in range(self._epoch, self._max_epochs):
        self.invoke_hook(TrainerStages.before_train_epoch)
        for i, data_batch in enumerate(data_loader):
            if i < self.inner_iter:
                # inner_iter may be read out from the checkpoint file,
                # so skip the trained iters in the epoch.
                continue
            data_batch = to_device(data_batch, self.device)
            self.data_batch = data_batch
            self._inner_iter = i
            self.invoke_hook(TrainerStages.before_train_iter)
            self.train_step(self.model, data_batch)
            self.invoke_hook(TrainerStages.after_train_iter)
            # Value changed after the hooks are invoked,
            # do not move them above the invoke_hook code.
            del self.data_batch
            self._iter += 1
            self._mode = ModeKeys.TRAIN
            if i + 1 >= self.iters_per_epoch:
                break
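Inside the loop, each invoke_hook call simply broadcasts a stage name to every registered hook. A simplified sketch of this dispatch, assuming the mmcv-style mechanism EpochBasedTrainer follows (not a verbatim copy of the ModelScope source):

# Simplified sketch of the hook dispatch (behavioural, not verbatim source).
def invoke_hook(self, fn_name):
    """Call the callback named `fn_name` (e.g. 'before_train_iter') on every
    registered hook; the hooks are kept sorted by their PRIORITY."""
    for hook in self.hooks:
        getattr(hook, fn_name)(self)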
The model's forward pass is completed in train_step (a simplified sketch of train_step follows the hook list below), while invoke_hook covers the remaining work: loss computation, backpropagation and gradient updates, log printing, and so on. By printing the relevant information, we can see that the following hook callbacks are enabled:
fn_name: <bound method OptimizerHook.before_train_iter of <modelscope.trainers.hooks.optimizer.base.OptimizerHook object at 0x7f63187c7ca0>>
fn_name: <bound method Hook.before_train_iter of <modelscope.trainers.hooks.evaluation_hook.EvaluationHook object at 0x7f63187766b0>>
fn_name: <bound method Hook.before_train_iter of <modelscope.trainers.hooks.lr_scheduler_hook.LrSchedulerHook object at 0x7f6318777580>>
fn_name: <bound method Hook.before_train_iter of <modelscope.trainers.hooks.checkpoint.checkpoint_hook.CheckpointHook object at 0x7f6318776260>>
fn_name: <bound method Hook.before_train_iter of <modelscope.trainers.hooks.iter_timer_hook.IterTimerHook object at 0x7f63186f2b00>>
fn_name: <bound method Hook.before_train_iter of <modelscope.trainers.hooks.logger.text_logger_hook.TextLoggerHook object at 0x7f63186f2950>>
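For reference, a simplified sketch of what train_step does: run the model's forward pass and keep the resulting output dict (which carries the loss under OutputKeys.LOSS) on the trainer, so that the hooks invoked afterwards can read it. This is a behavioural sketch, not the verbatim EpochBasedTrainer source:

# Simplified sketch of EpochBasedTrainer.train_step (behavioural, not verbatim).
def train_step(self, model, inputs):
    model.train()
    self._mode = ModeKeys.TRAIN
    # Forward pass: the model returns a dict such as {'loss': tensor(...)}.
    train_outputs = model.forward(inputs)
    # Kept on the trainer so that OptimizerHook.after_train_iter
    # (via OptimizerProcessor.backward) can read trainer.train_outputs[loss_key].
    self.train_outputs = train_outputs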
Following the code into the OptimizerHook object, we can see all of the post-processing steps:
# modelscope/trainers/hooks/optimizer/base.py
@HOOKS.register_module(module_name=Hooks.OptimizerHook)
class OptimizerHook(Hook):
    """Optimizer hook

    Args:
        cumulative_iters (int): interval of gradients accumulation. Default: 1
        grad_clip (dict): Default None. Containing keys:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm.
            More details please refer to `torch.nn.utils.clip_grad.clip_grad_norm_`
        loss_keys (str | list): keys list of loss
    """

    PRIORITY = Priority.ABOVE_NORMAL

    def __init__(self,
                 cumulative_iters=1,
                 grad_clip=None,
                 loss_keys=OutputKeys.LOSS,
                 **kwargs) -> None:
        if isinstance(loss_keys, str):
            loss_keys = [loss_keys]
        assert isinstance(loss_keys, (tuple, list))
        self.loss_keys = loss_keys
        self.cumulative_iters = cumulative_iters
        self.grad_clip = grad_clip
        self.processor = OptimizerProcessor()

    def set_processor(self, processor):
        self.processor = processor

    def before_run(self, trainer):
        trainer.cumulative_iters = self.cumulative_iters
        self.processor.initialize_optimizer(trainer)

    def before_train_iter(self, trainer):
        self.processor.before_forward(trainer)

    def after_train_iter(self, trainer):
        self.processor.backward(trainer, self.loss_keys, self.cumulative_iters,
                                self.grad_clip)
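The set_processor indirection is what makes the backward step replaceable: another hook (an AMP hook, for instance) can install its own processor before training starts. A hypothetical sketch of such a swap; LossScaleProcessor and its loss_scale parameter are made-up names for illustration, and the default OptimizerProcessor it subclasses is shown in the next snippet:

# Hypothetical processor swap; only OptimizerProcessor and set_processor come
# from the source above, everything else is illustrative.
class LossScaleProcessor(OptimizerProcessor):

    def __init__(self, loss_scale=0.5):
        self.loss_scale = loss_scale

    def backward(self, trainer, loss_keys, cumulative_iters, grad_clip):
        # Scale the loss, then delegate to the default backward/step/zero_grad.
        for k in loss_keys:
            trainer.train_outputs[k] = trainer.train_outputs[k] * self.loss_scale
        super().backward(trainer, loss_keys, cumulative_iters, grad_clip)

# Another hook could install it in its before_run, e.g.:
#     optimizer_hook.set_processor(LossScaleProcessor(loss_scale=0.5))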
The OptimizerProcessor object shows the complete backward / step / zero_grad process:
class OptimizerProcessor:

    def initialize_optimizer(self, trainer):
        """Initialize the optimizer.

        This is a strategic function which can be registered by other hook's function.
        """
        trainer.optimizer.zero_grad()

    def before_forward(self, trainer):
        pass

    def backward(self, trainer, loss_keys, cumulative_iters, grad_clip):
        """Do module backward, optimizer's step and zero_grad and clip the grads.

        This is a strategic function which can be registered by other hook's function.

        Args:
            trainer(`EpochBasedTrainer`): The trainer instance.
            loss_keys(`list`): The list of loss keys.
            cumulative_iters(`int`): The cumulative iters for gradients.
            grad_clip(`dict`): The grad clipping options.
        """
        for k in loss_keys:
            trainer.train_outputs[k] /= cumulative_iters
            trainer.train_outputs[k].backward()

        if Hook.every_n_iters(trainer, cumulative_iters):
            if grad_clip is not None:
                self.clip_grads(trainer.model.parameters(), **grad_clip)

            trainer.optimizer.step()
            trainer.optimizer.zero_grad()
If needed, this is the place to capture the corresponding data (for example the per-iteration losses or gradients), e.g. via a custom processor as sketched below.
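One way to do that capture, sketched with the same processor mechanism: subclass OptimizerProcessor, record the loss values and the global gradient norm right after backward() and before step() / zero_grad() clears the gradients, then install it with set_processor. The class name and the records attribute below are hypothetical; the snippet assumes the same context as modelscope/trainers/hooks/optimizer/base.py (OptimizerProcessor and Hook available):

# Hypothetical data-capturing processor (names are illustrative).
import torch

class DebugOptimizerProcessor(OptimizerProcessor):

    def __init__(self):
        self.records = []  # one (losses, grad_norm) entry per backward call

    def backward(self, trainer, loss_keys, cumulative_iters, grad_clip):
        # Reproduce the default backward so the gradients can be inspected
        # before optimizer.step() / zero_grad() clears them.
        for k in loss_keys:
            trainer.train_outputs[k] /= cumulative_iters
            trainer.train_outputs[k].backward()

        # Capture the data of interest: scalar losses and the global grad norm.
        grads = [p.grad.detach() for p in trainer.model.parameters()
                 if p.grad is not None]
        grad_norm = (torch.norm(torch.stack([g.norm(2) for g in grads])).item()
                     if grads else 0.0)
        losses = {k: trainer.train_outputs[k].item() for k in loss_keys}
        self.records.append((losses, grad_norm))

        if Hook.every_n_iters(trainer, cumulative_iters):
            if grad_clip is not None:
                self.clip_grads(trainer.model.parameters(), **grad_clip)
            trainer.optimizer.step()
            trainer.optimizer.zero_grad()

# Installed the same way as any other processor, e.g.
#     optimizer_hook.set_processor(DebugOptimizerProcessor())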