From 2f83f42e5b876bd98dd7dd0aa2bf7cbb796c079f Mon Sep 17 00:00:00 2001
From: mckay <wchpub@163.com>
Date: Mon, 3 Mar 2025 21:53:04 +0800
Subject: [PATCH] feat: Enhance training pipeline and configuration management

- Updated Docker run command with improved GPU and user settings
- Changed LearningRateScheduler.step() to take the current epoch and apply the epoch-based schedule; the loss-based decay is commented out for now
- Refactored loss computation to return detailed loss components
- Added TensorBoard logging for individual loss components
- Implemented checkpoint saving mechanism with configurable frequency
- Updated training script to use dynamic configuration and improved error handling
---
 README.md                        |  3 +-
 code/conversion/learning_rate.py |  9 +++-
 code/conversion/loss.py          | 23 +++++++----
 code/conversion/setup.conf       |  4 +-
 code/conversion/train.py         | 70 ++++++++++++++++++++------------
 5 files changed, 69 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index f5bbf13..36c2a66 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,8 @@ Then set up the environment via Docker or Anaconda.
 This is the most convenient way to try _NH\_Rep_, everything is already settled down in the docker.
 
         $ docker pull horaceguo/pytorchigr:isg
-        $ docker run  -e TZ=Asia/Shanghai  --runtime=nvidia --ipc=host --net=host -v PATH_TO_NH-REP/:/workspace -t -i horaceguo/pytorchigr:isg
+        $ docker run  -e TZ=Asia/Shanghai --name nh-rep --gpus all --ipc=host --net=host -v ./:/workspace --user $(id -u):$(id -g) -t -i horaceguo/pytorchigr:isg
+        # Previously --runtime=nvidia; keep that flag if your Docker version does not support --gpus.
         $ cd /workspace
         
 Then you can convert the points sampled on B-Rep model in _input\_data_ to implicit representation:
diff --git a/code/conversion/learning_rate.py b/code/conversion/learning_rate.py
index ea8908c..9492bb2 100644
--- a/code/conversion/learning_rate.py
+++ b/code/conversion/learning_rate.py
@@ -40,7 +40,7 @@ class LearningRateScheduler:
                 "weight_decay": self.weight_decay
             }])
             self.best_loss = float('inf')
-            self.patience = 10
+            self.patience = 20
             self.decay_factor = 0.5
             initial_lr = self.lr_schedules[0].get_learning_rate(0)
             self.lr = initial_lr
@@ -50,11 +50,15 @@ class LearningRateScheduler:
             logger.error(f"Error setting up optimizer: {str(e)}")
             raise
 
-    def step(self, current_loss):
+    def step(self, current_loss, current_epoch):
         """
         更新学习率
         :param current_loss: 当前验证损失
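+        :param current_epoch: current epoch, passed to self.adjust_learning_rate for the epoch-based update
+        NOTE: the loss-based adjustment that follows is currently disabled (kept inside a string literal), so current_loss is ignored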
         """
+        self.adjust_learning_rate(current_epoch)
+        '''
         if current_loss < self.best_loss:
             self.best_loss = current_loss
             self.epochs_since_improvement = 0
@@ -67,6 +71,7 @@ class LearningRateScheduler:
                 param_group['lr'] = self.lr
             print(f"学习率更新为: {self.lr:.6f}")
             self.epochs_since_improvement = 0
+        '''
 
     def reset(self):
         """
diff --git a/code/conversion/loss.py b/code/conversion/loss.py
index b48a666..29d6848 100644
--- a/code/conversion/loss.py
+++ b/code/conversion/loss.py
@@ -175,16 +175,21 @@ class LossManager:
         # 计算修正损失
         correction_loss = self.correction_loss(mnfld_pnts, mnfld_pred, all_fi)
 
+
+        loss_details = {
+            "manifold": self.weights["manifold"] * manifold_loss,
+            "manifold_patch": manifold_loss_patch,
+            "normals": self.weights["normals"] * normals_loss,
+            "eikonal": self.weights["eikonal"] * eikonal_loss,
+            "offsurface": self.weights["offsurface"] * offsurface_loss,
+            "consistency": self.weights["consistency"] * consistency_loss,
+            "correction": self.weights["correction"] * correction_loss,
+        }
+
         # 计算总损失
-        total_loss = (self.weights["manifold"] * manifold_loss + \
-            #self.weights["feature_manifold"] * feature_manifold_loss + \
-            manifold_loss_patch + \
-            self.weights["normals"] * normals_loss + \
-            self.weights["eikonal"] * eikonal_loss + \
-            self.weights["offsurface"] * offsurface_loss + \
-            self.weights["consistency"] * consistency_loss + \
-            self.weights["correction"] * correction_loss)
-        return total_loss
+        total_loss = sum(loss_details.values())
+
+        return total_loss, loss_details
 
 
 
diff --git a/code/conversion/setup.conf b/code/conversion/setup.conf
index d152668..1c931d0 100644
--- a/code/conversion/setup.conf
+++ b/code/conversion/setup.conf
@@ -2,7 +2,8 @@ train{
     folderprefix = ""
     input_path =  ../data/input_data/
     fileprefix_list = [
-broken_bullet_50k, # more input models can be added here
+        00000050_0_50k,
+        #broken_bullet_50k, # more input models can be added here
 ]
     d_in = 3 # 输入数据的维度。在3D点云数据中，通常为3（x、y、z坐标）
     plot_frequency = 5000 # 每5000次迭代绘制一次点云
@@ -39,6 +40,7 @@ network{
         sampler_type = NormalPerPoint # 采样器类型。NormalPerPoint表示每个点都使用正态分布采样
         properties{
             global_sigma = 1.8 # 全局sigma值。1.8表示全局sigma值为1.8
+            local_sigma = 0.01 # local sigma for the sampler; train.py reads it as network.sampler.properties.local_sigma
             }
         }
     loss{
diff --git a/code/conversion/train.py b/code/conversion/train.py
index 2a28498..e261604 100644
--- a/code/conversion/train.py
+++ b/code/conversion/train.py
@@ -22,13 +22,14 @@ from model.network import NHRepNet  # 导入 NHRepNet
 from model.sample import Sampler
 
 class NHREPNet_Training:
-    def __init__(self, data_dir, name_prefix: str, if_baseline: bool = False, if_feature_sample: bool = False):
-        self.conf = ConfigFactory.parse_file('./conversion/setup.conf') 
+    def __init__(self, name_prefix: str, conf, if_baseline: bool = False, if_feature_sample: bool = False):
+        self.conf = conf
         self.sampler = Sampler.get_sampler(
                 self.conf.get_string('network.sampler.sampler_type'))(
                     global_sigma=self.conf.get_float('network.sampler.properties.global_sigma'),
                     local_sigma=self.conf.get_float('network.sampler.properties.local_sigma')
                 )
+        data_dir = self.conf.get_string('train.input_path')
         self.dataset = NHREP_Dataset(data_dir, name_prefix, if_baseline, if_feature_sample)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -36,10 +37,13 @@ class NHREPNet_Training:
         self.d_in = 3  # 输入维度，x, y, z.
         self.dims_sdf = [256, 256, 256]  # 隐藏层维度
 
+        self.nepochs = 15000  # 训练轮数
+        folder = self.conf.get_string('train.folderprefix')
+        self.writer = SummaryWriter(os.path.join("summary",folder, name_prefix))  # TensorBoard 记录器
 
+        # checkpoint
+        self.init_checkpoints()
 
-        self.nepochs = 15000  # 训练轮数
-        self.writer = SummaryWriter()  # TensorBoard 记录器
 
     def run_nhrepnet_training(self):
         # 数据准备
@@ -48,7 +52,7 @@ class NHREPNet_Training:
         feature_mask_cpu = self.dataset.get_feature_mask().numpy() # 特征掩码
         self.feature_mask = torch.from_numpy(feature_mask_cpu).to(self.device)  # 特征掩码 # 特征掩码
         self.points_batch = 16384 # 批次大小
-
+        self.compute_local_sigma()
 
         n_branch = int(torch.max(self.feature_mask).item())  # 计算分支数量
         n_batchsize = self.points_batch  # 设置批次大小
@@ -78,7 +82,7 @@ class NHREPNet_Training:
         logger.info("开始训练")
         self.model.train()  # 设置模型为训练模式
 
-        for epoch in range(self.nepochs):  # 开始训练循环
+        for epoch in tqdm(range(self.nepochs), desc="训练进度", unit="epoch"):  # 开始训练循环
             try:
                 self.train_one_epoch(epoch, patch_id, patch_id_n, n_patch_batch, n_patch_last, n_branch, n_batchsize)
             except Exception as e:
@@ -86,14 +90,13 @@ class NHREPNet_Training:
                 break
 
     def train_one_epoch(self, epoch, patch_id, patch_id_n, n_patch_batch, n_patch_last, n_branch, n_batchsize):
-        logger.info(f"Epoch {epoch}/{self.nepochs} 开始")
+        #logger.info(f"Epoch {epoch}/{self.nepochs} 开始")
         # 1.3，获取索引
         indices = self.get_indices(patch_id, patch_id_n, n_patch_batch, n_patch_last, n_branch)
 
         # 1.4，获取数据
         cur_data = self.data[indices]  # x, y, z, nx, ny, nz
         mnfld_pnts = cur_data[:, :self.d_in]  # 提取流形点
-        self.compute_local_sigma()
         mnfld_sigma = self.local_sigma[indices]  # 提取噪声点
 
         nonmnfld_pnts = self.sampler.get_points(mnfld_pnts.unsqueeze(0), mnfld_sigma.unsqueeze(0)).squeeze()  # 生成非流形点
@@ -101,9 +104,6 @@ class NHREPNet_Training:
         #TODO 记录了log
 
         # 2，前向传播
-
-        self.scheduler.adjust_learning_rate(epoch)
-
         #logger.info(f"mnfld_pnts: {mnfld_pnts.shape}")
         #logger.info(f"nonmnfld_pnts: {nonmnfld_pnts.shape}")
         
@@ -116,7 +116,7 @@ class NHREPNet_Training:
 
         normals = cur_data[:, -self.d_in:]
         # 计算损失
-        loss = self.loss_manager.compute_loss(
+        loss,loss_details = self.loss_manager.compute_loss(
             mnfld_pnts = mnfld_pnts,
             normals = normals,
             mnfld_pred_all = mnfld_pred_all,
@@ -128,7 +128,7 @@ class NHREPNet_Training:
             n_patch_last = n_patch_last,
         )  # 计算损失
 
-        self.scheduler.step(loss)
+        self.scheduler.step(loss,epoch)
 
         # 反向传播
         self.scheduler.optimizer.zero_grad()  # 清空梯度
@@ -136,8 +136,13 @@ class NHREPNet_Training:
         self.scheduler.optimizer.step()  # 更新参数
 
         avg_loss = loss.item()
-        logger.info(f'Epoch [{epoch}/{self.nepochs}], Average Loss: {avg_loss:.4f}')
-        self.writer.add_scalar('Loss/train', avg_loss, epoch)  # 记录损失到 TensorBoard
+        if epoch % 100 == 0:
+            #logger.info(f'Epoch [{epoch}/{self.nepochs}]')
+            self.writer.add_scalar('Loss/train', avg_loss, epoch)  # 记录损失到 TensorBoard
+            for k,v in loss_details.items():
+                self.writer.add_scalar('Loss/'+k, v.item(), epoch) 
+        if epoch % self.conf.get_int('train.checkpoint_frequency') == 0:  # 每隔一定轮次保存检查点
+            self.save_checkpoints(epoch)
 
 #============================ 前向传播 数据准备 ============================
     def compute_patch(self, n_branch, n_patch_batch, n_patch_last, feature_mask_cpu):
@@ -202,23 +207,34 @@ class NHREPNet_Training:
 
 
 #============================ 保存模型 ============================
-    def save_checkpoints(self, epoch):
+    def init_checkpoints(self):
+        """Define the checkpoint directory attributes for this run and create the directories."""
+        self.checkpoints_path = os.path.join("../exps/single_shape", name_prefix, "checkpoints")  # NOTE(review): name_prefix is not a parameter or attribute here; it presumably resolves to the module-level global set under __main__ - confirm
+        self.ModelParameters_path = os.path.join(self.checkpoints_path, "ModelParameters")
+        self.OptimizerParameters_path = os.path.join(self.checkpoints_path, "OptimizerParameters")
 
+        for ckpt_dir in (self.ModelParameters_path, self.OptimizerParameters_path):
+            os.makedirs(ckpt_dir, exist_ok=True)  # missing parents are created; an existing directory is not an error
+
+    def save_checkpoints(self, epoch):
-        torch.save(
-            {"epoch": epoch, "model_state_dict": self.network.state_dict()},
-            os.path.join(self.checkpoints_path, self.model_params_subdir, str(epoch) + ".pth"))
-        torch.save(
-            {"epoch": epoch, "model_state_dict": self.network.state_dict()},
-            os.path.join(self.checkpoints_path, self.model_params_subdir, "latest.pth"))
-        torch.save(
-            {"epoch": epoch, "optimizer_state_dict": self.optimizer.state_dict()},
-            os.path.join(self.checkpoints_path, self.optimizer_params_subdir, str(epoch) + ".pth"))
-        torch.save(
-            {"epoch": epoch, "optimizer_state_dict": self.optimizer.state_dict()},
-            os.path.join(self.checkpoints_path, self.optimizer_params_subdir, "latest.pth"))
+        """Save model and optimizer state for ``epoch``.
+
+        Each state dict is written to its own directory twice: as ``<epoch>.pth`` and as ``latest.pth``.
+        """
+        sources = (
+            ("model_state_dict", self.model, self.ModelParameters_path),
+            ("optimizer_state_dict", self.scheduler.optimizer, self.OptimizerParameters_path),
+        )
+        for state_key, holder, out_dir in sources:
+            for file_name in (str(epoch) + ".pth", "latest.pth"):
+                # state_dict() is re-read for each file written.
+                torch.save({"epoch": epoch, state_key: holder.state_dict()}, os.path.join(out_dir, file_name))
 
 if __name__ == "__main__":
-    data_dir = '../data/input_data'  # 数据目录
     name_prefix = 'broken_bullet_50k'
-    train = NHREPNet_Training(data_dir, name_prefix, if_baseline=True, if_feature_sample=False)
-    train.run_nhrepnet_training()
\ No newline at end of file
+    conf = ConfigFactory.parse_file('./conversion/setup.conf')  # NOTE(review): name_prefix above is hard-coded and is not taken from train.fileprefix_list in this config - confirm the intended model
+    try:
+        NHREPNet_Training(name_prefix, conf, if_baseline=True, if_feature_sample=False).run_nhrepnet_training()
+    except Exception as e:
+        logger.error(str(e))
+        raise  # re-raise after logging so the traceback is shown and the process exits non-zero
\ No newline at end of file