From 2f83f42e5b876bd98dd7dd0aa2bf7cbb796c079f Mon Sep 17 00:00:00 2001 From: mckay Date: Mon, 3 Mar 2025 21:53:04 +0800 Subject: [PATCH] feat: Enhance training pipeline and configuration management - Updated Docker run command with improved GPU and user settings - Modified learning rate scheduler to support epoch-based adjustments - Refactored loss computation to return detailed loss components - Added TensorBoard logging for individual loss components - Implemented checkpoint saving mechanism with configurable frequency - Updated training script to use dynamic configuration and improved error handling --- README.md | 3 +- code/conversion/learning_rate.py | 9 +++- code/conversion/loss.py | 23 +++++++---- code/conversion/setup.conf | 4 +- code/conversion/train.py | 70 ++++++++++++++++++++------------ 5 files changed, 69 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index f5bbf13..36c2a66 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,8 @@ Then set up the environment via Docker or Anaconda. This is the most convenient way to try _NH\_Rep_, everything is already settled down in the docker. $ docker pull horaceguo/pytorchigr:isg - $ docker run -e TZ=Asia/Shanghai --runtime=nvidia --ipc=host --net=host -v PATH_TO_NH-REP/:/workspace -t -i horaceguo/pytorchigr:isg + $ docker run -e TZ=Asia/Shanghai --name nh-rep --gpus all --ipc=host --net=host -v ./:/workspace --user $(id -u):$(id -g) -t -i horaceguo/pytorchigr:isg + # 原来是 --runtime=nvidia $ cd /workspace Then you can convert the points sampled on B-Rep model in _input\_data_ to implicit representation: diff --git a/code/conversion/learning_rate.py b/code/conversion/learning_rate.py index ea8908c..9492bb2 100644 --- a/code/conversion/learning_rate.py +++ b/code/conversion/learning_rate.py @@ -40,7 +40,7 @@ class LearningRateScheduler: "weight_decay": self.weight_decay }]) self.best_loss = float('inf') - self.patience = 10 + self.patience = 20 self.decay_factor = 0.5 initial_lr = self.lr_schedules[0].get_learning_rate(0) self.lr = initial_lr @@ -50,11 +50,15 @@ class LearningRateScheduler: logger.error(f"Error setting up optimizer: {str(e)}") raise - def step(self, current_loss): + def step(self, current_loss, current_epoch): """ 更新学习率 :param current_loss: 当前验证损失 + 先 self.adjust_learning_rate 基于 epoch 进行一个整体 lr 更新 + 然后 基于 loss, 动态进行调整 """ + self.adjust_learning_rate(current_epoch) + ''' if current_loss < self.best_loss: self.best_loss = current_loss self.epochs_since_improvement = 0 @@ -67,6 +71,7 @@ class LearningRateScheduler: param_group['lr'] = self.lr print(f"学习率更新为: {self.lr:.6f}") self.epochs_since_improvement = 0 + ''' def reset(self): """ diff --git a/code/conversion/loss.py b/code/conversion/loss.py index b48a666..29d6848 100644 --- a/code/conversion/loss.py +++ b/code/conversion/loss.py @@ -175,16 +175,21 @@ class LossManager: # 计算修正损失 correction_loss = self.correction_loss(mnfld_pnts, mnfld_pred, all_fi) + + loss_details = { + "manifold": self.weights["manifold"] * manifold_loss, + "manifold_patch": manifold_loss_patch, + "normals": self.weights["normals"] * normals_loss, + "eikonal": self.weights["eikonal"] * eikonal_loss, + "offsurface": self.weights["offsurface"] * offsurface_loss, + "consistency": self.weights["consistency"] * consistency_loss, + "correction": self.weights["correction"] * correction_loss, + } + # 计算总损失 - total_loss = (self.weights["manifold"] * manifold_loss + \ - #self.weights["feature_manifold"] * feature_manifold_loss + \ - manifold_loss_patch + \ - self.weights["normals"] * normals_loss + \ - self.weights["eikonal"] * eikonal_loss + \ - self.weights["offsurface"] * offsurface_loss + \ - self.weights["consistency"] * consistency_loss + \ - self.weights["correction"] * correction_loss) - return total_loss + total_loss = sum(loss_details.values()) + + return total_loss, loss_details diff --git a/code/conversion/setup.conf b/code/conversion/setup.conf index d152668..1c931d0 100644 --- a/code/conversion/setup.conf +++ b/code/conversion/setup.conf @@ -2,7 +2,8 @@ train{ folderprefix = "" input_path = ../data/input_data/ fileprefix_list = [ -broken_bullet_50k, # more input models can be added here + 00000050_0_50k, + #broken_bullet_50k, # more input models can be added here ] d_in = 3 # 输入数据的维度。在3D点云数据中,通常为3(x、y、z坐标) plot_frequency = 5000 # 每5000次迭代绘制一次点云 @@ -39,6 +40,7 @@ network{ sampler_type = NormalPerPoint # 采样器类型。NormalPerPoint表示每个点都使用正态分布采样 properties{ global_sigma = 1.8 # 全局sigma值。1.8表示全局sigma值为1.8 + local_sigma = 0.01 # 局部sigma值。0.01表示局部sigma值为0.01 } } loss{ diff --git a/code/conversion/train.py b/code/conversion/train.py index 2a28498..e261604 100644 --- a/code/conversion/train.py +++ b/code/conversion/train.py @@ -22,13 +22,14 @@ from model.network import NHRepNet # 导入 NHRepNet from model.sample import Sampler class NHREPNet_Training: - def __init__(self, data_dir, name_prefix: str, if_baseline: bool = False, if_feature_sample: bool = False): - self.conf = ConfigFactory.parse_file('./conversion/setup.conf') + def __init__(self, name_prefix: str, conf, if_baseline: bool = False, if_feature_sample: bool = False): + self.conf = conf self.sampler = Sampler.get_sampler( self.conf.get_string('network.sampler.sampler_type'))( global_sigma=self.conf.get_float('network.sampler.properties.global_sigma'), local_sigma=self.conf.get_float('network.sampler.properties.local_sigma') ) + data_dir = self.conf.get_string('train.input_path') self.dataset = NHREP_Dataset(data_dir, name_prefix, if_baseline, if_feature_sample) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -36,10 +37,13 @@ class NHREPNet_Training: self.d_in = 3 # 输入维度,x, y, z. self.dims_sdf = [256, 256, 256] # 隐藏层维度 + self.nepochs = 15000 # 训练轮数 + folder = self.conf.get_string('train.folderprefix') + self.writer = SummaryWriter(os.path.join("summary",folder, name_prefix)) # TensorBoard 记录器 + # checkpoint + self.init_checkpoints() - self.nepochs = 15000 # 训练轮数 - self.writer = SummaryWriter() # TensorBoard 记录器 def run_nhrepnet_training(self): # 数据准备 @@ -48,7 +52,7 @@ class NHREPNet_Training: feature_mask_cpu = self.dataset.get_feature_mask().numpy() # 特征掩码 self.feature_mask = torch.from_numpy(feature_mask_cpu).to(self.device) # 特征掩码 # 特征掩码 self.points_batch = 16384 # 批次大小 - + self.compute_local_sigma() n_branch = int(torch.max(self.feature_mask).item()) # 计算分支数量 n_batchsize = self.points_batch # 设置批次大小 @@ -78,7 +82,7 @@ class NHREPNet_Training: logger.info("开始训练") self.model.train() # 设置模型为训练模式 - for epoch in range(self.nepochs): # 开始训练循环 + for epoch in tqdm(range(self.nepochs), desc="训练进度", unit="epoch"): # 开始训练循环 try: self.train_one_epoch(epoch, patch_id, patch_id_n, n_patch_batch, n_patch_last, n_branch, n_batchsize) except Exception as e: @@ -86,14 +90,13 @@ class NHREPNet_Training: break def train_one_epoch(self, epoch, patch_id, patch_id_n, n_patch_batch, n_patch_last, n_branch, n_batchsize): - logger.info(f"Epoch {epoch}/{self.nepochs} 开始") + #logger.info(f"Epoch {epoch}/{self.nepochs} 开始") # 1.3,获取索引 indices = self.get_indices(patch_id, patch_id_n, n_patch_batch, n_patch_last, n_branch) # 1.4,获取数据 cur_data = self.data[indices] # x, y, z, nx, ny, nz mnfld_pnts = cur_data[:, :self.d_in] # 提取流形点 - self.compute_local_sigma() mnfld_sigma = self.local_sigma[indices] # 提取噪声点 nonmnfld_pnts = self.sampler.get_points(mnfld_pnts.unsqueeze(0), mnfld_sigma.unsqueeze(0)).squeeze() # 生成非流形点 @@ -101,9 +104,6 @@ class NHREPNet_Training: #TODO 记录了log # 2,前向传播 - - self.scheduler.adjust_learning_rate(epoch) - #logger.info(f"mnfld_pnts: {mnfld_pnts.shape}") #logger.info(f"nonmnfld_pnts: {nonmnfld_pnts.shape}") @@ -116,7 +116,7 @@ class NHREPNet_Training: normals = cur_data[:, -self.d_in:] # 计算损失 - loss = self.loss_manager.compute_loss( + loss,loss_details = self.loss_manager.compute_loss( mnfld_pnts = mnfld_pnts, normals = normals, mnfld_pred_all = mnfld_pred_all, @@ -128,7 +128,7 @@ class NHREPNet_Training: n_patch_last = n_patch_last, ) # 计算损失 - self.scheduler.step(loss) + self.scheduler.step(loss,epoch) # 反向传播 self.scheduler.optimizer.zero_grad() # 清空梯度 @@ -136,8 +136,13 @@ class NHREPNet_Training: self.scheduler.optimizer.step() # 更新参数 avg_loss = loss.item() - logger.info(f'Epoch [{epoch}/{self.nepochs}], Average Loss: {avg_loss:.4f}') - self.writer.add_scalar('Loss/train', avg_loss, epoch) # 记录损失到 TensorBoard + if epoch % 100 == 0: + #logger.info(f'Epoch [{epoch}/{self.nepochs}]') + self.writer.add_scalar('Loss/train', avg_loss, epoch) # 记录损失到 TensorBoard + for k,v in loss_details.items(): + self.writer.add_scalar('Loss/'+k, v.item(), epoch) + if epoch % self.conf.get_int('train.checkpoint_frequency') == 0: # 每隔一定轮次保存检查点 + self.save_checkpoints(epoch) #============================ 前向传播 数据准备 ============================ def compute_patch(self, n_branch, n_patch_batch, n_patch_last, feature_mask_cpu): @@ -202,23 +207,34 @@ class NHREPNet_Training: #============================ 保存模型 ============================ - def save_checkpoints(self, epoch): + def init_checkpoints(self): + self.checkpoints_path = os.path.join("../exps/single_shape",name_prefix, "checkpoints") + self.ModelParameters_path = os.path.join(self.checkpoints_path, "ModelParameters") + self.OptimizerParameters_path = os.path.join(self.checkpoints_path, "OptimizerParameters") + # 创建目录 + os.makedirs(self.ModelParameters_path, exist_ok=True) + os.makedirs(self.OptimizerParameters_path, exist_ok=True) + + def save_checkpoints(self, epoch): torch.save( - {"epoch": epoch, "model_state_dict": self.network.state_dict()}, - os.path.join(self.checkpoints_path, self.model_params_subdir, str(epoch) + ".pth")) + {"epoch": epoch, "model_state_dict": self.model.state_dict()}, + os.path.join(self.ModelParameters_path, str(epoch) + ".pth")) torch.save( - {"epoch": epoch, "model_state_dict": self.network.state_dict()}, - os.path.join(self.checkpoints_path, self.model_params_subdir, "latest.pth")) + {"epoch": epoch, "model_state_dict": self.model.state_dict()}, + os.path.join(self.ModelParameters_path, "latest.pth")) torch.save( - {"epoch": epoch, "optimizer_state_dict": self.optimizer.state_dict()}, - os.path.join(self.checkpoints_path, self.optimizer_params_subdir, str(epoch) + ".pth")) + {"epoch": epoch, "optimizer_state_dict": self.scheduler.optimizer.state_dict()}, + os.path.join(self.OptimizerParameters_path, str(epoch) + ".pth")) torch.save( - {"epoch": epoch, "optimizer_state_dict": self.optimizer.state_dict()}, - os.path.join(self.checkpoints_path, self.optimizer_params_subdir, "latest.pth")) + {"epoch": epoch, "optimizer_state_dict": self.scheduler.optimizer.state_dict()}, + os.path.join(self.OptimizerParameters_path, "latest.pth")) if __name__ == "__main__": - data_dir = '../data/input_data' # 数据目录 name_prefix = 'broken_bullet_50k' - train = NHREPNet_Training(data_dir, name_prefix, if_baseline=True, if_feature_sample=False) - train.run_nhrepnet_training() \ No newline at end of file + conf = ConfigFactory.parse_file('./conversion/setup.conf') + try: + train = NHREPNet_Training(name_prefix, conf, if_baseline=True, if_feature_sample=False) + train.run_nhrepnet_training() + except Exception as e: + logger.error(str(e)) \ No newline at end of file