
feat: Enhance training pipeline and configuration management

- Updated Docker run command with improved GPU and user settings
- Modified learning rate scheduler to support epoch-based adjustments
- Refactored loss computation to return detailed loss components
- Added TensorBoard logging for individual loss components
- Implemented checkpoint saving mechanism with configurable frequency
- Updated training script to use dynamic configuration and improved error handling
NH-Rep · mckay · 3 months ago · commit 2f83f42e5b
Changed files (lines changed):
  1. README.md (3)
  2. code/conversion/learning_rate.py (9)
  3. code/conversion/loss.py (23)
  4. code/conversion/setup.conf (4)
  5. code/conversion/train.py (66)

README.md (3)

@@ -28,7 +28,8 @@ Then set up the environment via Docker or Anaconda.
 This is the most convenient way to try _NH\_Rep_, everything is already settled down in the docker.
 $ docker pull horaceguo/pytorchigr:isg
-$ docker run -e TZ=Asia/Shanghai --runtime=nvidia --ipc=host --net=host -v PATH_TO_NH-REP/:/workspace -t -i horaceguo/pytorchigr:isg
+$ docker run -e TZ=Asia/Shanghai --name nh-rep --gpus all --ipc=host --net=host -v ./:/workspace --user $(id -u):$(id -g) -t -i horaceguo/pytorchigr:isg
+# previously --runtime=nvidia
 $ cd /workspace
 Then you can convert the points sampled on B-Rep model in _input\_data_ to implicit representation:
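
Since the run command switches from --runtime=nvidia to --gpus all, a quick sanity check inside the container confirms the GPU is actually visible; this is a plain PyTorch snippet, not part of the commit:

import torch

# Expect True and at least one device after `docker run ... --gpus all ...`.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))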

code/conversion/learning_rate.py (9)

@@ -40,7 +40,7 @@ class LearningRateScheduler:
             "weight_decay": self.weight_decay
         }])
         self.best_loss = float('inf')
-        self.patience = 10
+        self.patience = 20
         self.decay_factor = 0.5
         initial_lr = self.lr_schedules[0].get_learning_rate(0)
         self.lr = initial_lr
@@ -50,11 +50,15 @@ class LearningRateScheduler:
             logger.error(f"Error setting up optimizer: {str(e)}")
             raise
-    def step(self, current_loss):
+    def step(self, current_loss, current_epoch):
         """
         Update the learning rate.
         :param current_loss: current validation loss
+        self.adjust_learning_rate performs an overall lr update based on the epoch,
+        then the lr is adjusted dynamically based on the loss.
         """
+        self.adjust_learning_rate(current_epoch)
+        '''
         if current_loss < self.best_loss:
             self.best_loss = current_loss
             self.epochs_since_improvement = 0
@@ -67,6 +71,7 @@ class LearningRateScheduler:
                 param_group['lr'] = self.lr
             print(f"Learning rate updated to: {self.lr:.6f}")
             self.epochs_since_improvement = 0
+        '''
     def reset(self):
         """

code/conversion/loss.py (23)

@@ -175,16 +175,21 @@ class LossManager:
         # compute the correction loss
         correction_loss = self.correction_loss(mnfld_pnts, mnfld_pred, all_fi)
+        loss_details = {
+            "manifold": self.weights["manifold"] * manifold_loss,
+            "manifold_patch": manifold_loss_patch,
+            "normals": self.weights["normals"] * normals_loss,
+            "eikonal": self.weights["eikonal"] * eikonal_loss,
+            "offsurface": self.weights["offsurface"] * offsurface_loss,
+            "consistency": self.weights["consistency"] * consistency_loss,
+            "correction": self.weights["correction"] * correction_loss,
+        }
         # compute the total loss
-        total_loss = (self.weights["manifold"] * manifold_loss + \
-                      #self.weights["feature_manifold"] * feature_manifold_loss + \
-                      manifold_loss_patch + \
-                      self.weights["normals"] * normals_loss + \
-                      self.weights["eikonal"] * eikonal_loss + \
-                      self.weights["offsurface"] * offsurface_loss + \
-                      self.weights["consistency"] * consistency_loss + \
-                      self.weights["correction"] * correction_loss)
-        return total_loss
+        total_loss = sum(loss_details.values())
+        return total_loss, loss_details
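
Returning the breakdown alongside the scalar keeps gradients intact for every term, since sum() over the dict values reproduces the old parenthesized sum. A self-contained sketch of the pattern with hypothetical weights and tensors (not the real LossManager inputs):

import torch

weights = {"manifold": 1.0, "normals": 1.0, "eikonal": 0.1}
raw = {k: torch.rand(1, requires_grad=True) for k in weights}
loss_details = {k: weights[k] * raw[k] for k in weights}  # weighted terms
total_loss = sum(loss_details.values())                   # same total as the old sum
total_loss.backward()                                     # gradients flow through all terms
print({k: v.item() for k, v in loss_details.items()})     # .item() detaches for logging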

code/conversion/setup.conf (4)

@@ -2,7 +2,8 @@ train{
     folderprefix = ""
     input_path = ../data/input_data/
     fileprefix_list = [
-        broken_bullet_50k, # more input models can be added here
+        00000050_0_50k,
+        #broken_bullet_50k, # more input models can be added here
     ]
     d_in = 3 # dimensionality of the input data; for 3D point clouds this is usually 3 (x, y, z coordinates)
     plot_frequency = 5000 # plot the point cloud every 5000 iterations
@@ -39,6 +40,7 @@ network{
     sampler_type = NormalPerPoint # sampler type; NormalPerPoint samples around each point from a normal distribution
     properties{
         global_sigma = 1.8 # global sigma value
+        local_sigma = 0.01 # local (per-point) sigma value
     }
 }
 loss{
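
train.py reads these settings through pyhocon. Note that the training loop below also queries train.checkpoint_frequency, which is assumed to be defined in this file even though no such line appears in the hunks of this commit. A minimal access sketch:

from pyhocon import ConfigFactory

conf = ConfigFactory.parse_file('./conversion/setup.conf')
local_sigma = conf.get_float('network.sampler.properties.local_sigma')  # 0.01
models = conf.get_list('train.fileprefix_list')                         # ['00000050_0_50k']
ckpt_every = conf.get_int('train.checkpoint_frequency')                 # assumed key, see note above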

code/conversion/train.py (66)

@@ -22,13 +22,14 @@ from model.network import NHRepNet # import NHRepNet
 from model.sample import Sampler
 class NHREPNet_Training:
-    def __init__(self, data_dir, name_prefix: str, if_baseline: bool = False, if_feature_sample: bool = False):
-        self.conf = ConfigFactory.parse_file('./conversion/setup.conf')
+    def __init__(self, name_prefix: str, conf, if_baseline: bool = False, if_feature_sample: bool = False):
+        self.conf = conf
         self.sampler = Sampler.get_sampler(
             self.conf.get_string('network.sampler.sampler_type'))(
             global_sigma=self.conf.get_float('network.sampler.properties.global_sigma'),
             local_sigma=self.conf.get_float('network.sampler.properties.local_sigma')
         )
+        data_dir = self.conf.get_string('train.input_path')
         self.dataset = NHREP_Dataset(data_dir, name_prefix, if_baseline, if_feature_sample)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -36,10 +37,13 @@ class NHREPNet_Training:
         self.d_in = 3 # input dimensionality: x, y, z
         self.dims_sdf = [256, 256, 256] # hidden layer dimensions
+        self.nepochs = 15000 # number of training epochs
+        folder = self.conf.get_string('train.folderprefix')
+        self.writer = SummaryWriter(os.path.join("summary", folder, name_prefix)) # TensorBoard writer
+        # checkpoints
+        self.init_checkpoints()
-        self.nepochs = 15000 # number of training epochs
-        self.writer = SummaryWriter() # TensorBoard writer
     def run_nhrepnet_training(self):
         # data preparation
@@ -48,7 +52,7 @@ class NHREPNet_Training:
         feature_mask_cpu = self.dataset.get_feature_mask().numpy() # feature mask
         self.feature_mask = torch.from_numpy(feature_mask_cpu).to(self.device) # feature mask
         self.points_batch = 16384 # batch size
+        self.compute_local_sigma()
         n_branch = int(torch.max(self.feature_mask).item()) # number of branches
         n_batchsize = self.points_batch # set the batch size
@@ -78,7 +82,7 @@ class NHREPNet_Training:
         logger.info("Starting training")
         self.model.train() # set the model to training mode
-        for epoch in range(self.nepochs): # training loop
+        for epoch in tqdm(range(self.nepochs), desc="training progress", unit="epoch"): # training loop
             try:
                 self.train_one_epoch(epoch, patch_id, patch_id_n, n_patch_batch, n_patch_last, n_branch, n_batchsize)
             except Exception as e:
@@ -86,14 +90,13 @@ class NHREPNet_Training:
                 break
     def train_one_epoch(self, epoch, patch_id, patch_id_n, n_patch_batch, n_patch_last, n_branch, n_batchsize):
-        logger.info(f"Epoch {epoch}/{self.nepochs} started")
+        #logger.info(f"Epoch {epoch}/{self.nepochs} started")
         # 1.3 get the indices
         indices = self.get_indices(patch_id, patch_id_n, n_patch_batch, n_patch_last, n_branch)
         # 1.4 get the data
         cur_data = self.data[indices] # x, y, z, nx, ny, nz
         mnfld_pnts = cur_data[:, :self.d_in] # extract the manifold points
-        self.compute_local_sigma()
         mnfld_sigma = self.local_sigma[indices] # extract the noise points
         nonmnfld_pnts = self.sampler.get_points(mnfld_pnts.unsqueeze(0), mnfld_sigma.unsqueeze(0)).squeeze() # generate non-manifold points
@@ -101,9 +104,6 @@
         #TODO: logging
         # 2. forward pass
-        self.scheduler.adjust_learning_rate(epoch)
-        #logger.info(f"mnfld_pnts: {mnfld_pnts.shape}")
-        #logger.info(f"nonmnfld_pnts: {nonmnfld_pnts.shape}")
@@ -116,7 +116,7 @@
         normals = cur_data[:, -self.d_in:]
         # compute the loss
-        loss = self.loss_manager.compute_loss(
+        loss, loss_details = self.loss_manager.compute_loss(
             mnfld_pnts = mnfld_pnts,
             normals = normals,
             mnfld_pred_all = mnfld_pred_all,
@@ -128,7 +128,7 @@
             n_patch_last = n_patch_last,
         ) # compute the loss
-        self.scheduler.step(loss)
+        self.scheduler.step(loss, epoch)
         # backward pass
         self.scheduler.optimizer.zero_grad() # clear the gradients
@@ -136,8 +136,13 @@
         self.scheduler.optimizer.step() # update the parameters
         avg_loss = loss.item()
-        logger.info(f'Epoch [{epoch}/{self.nepochs}], Average Loss: {avg_loss:.4f}')
+        if epoch % 100 == 0:
+            #logger.info(f'Epoch [{epoch}/{self.nepochs}]')
+            self.writer.add_scalar('Loss/train', avg_loss, epoch) # log the loss to TensorBoard
+            for k, v in loss_details.items():
+                self.writer.add_scalar('Loss/' + k, v.item(), epoch)
+        if epoch % self.conf.get_int('train.checkpoint_frequency') == 0: # save a checkpoint every checkpoint_frequency epochs
+            self.save_checkpoints(epoch)
     #============================ forward pass: data preparation ============================
     def compute_patch(self, n_branch, n_patch_batch, n_patch_last, feature_mask_cpu):
@@ -202,23 +207,34 @@
     #============================ saving the model ============================
-    def save_checkpoints(self, epoch):
+    def init_checkpoints(self):
+        self.checkpoints_path = os.path.join("../exps/single_shape", name_prefix, "checkpoints")
+        self.ModelParameters_path = os.path.join(self.checkpoints_path, "ModelParameters")
+        self.OptimizerParameters_path = os.path.join(self.checkpoints_path, "OptimizerParameters")
+        # create the directories
+        os.makedirs(self.ModelParameters_path, exist_ok=True)
+        os.makedirs(self.OptimizerParameters_path, exist_ok=True)
+    def save_checkpoints(self, epoch):
         torch.save(
-            {"epoch": epoch, "model_state_dict": self.network.state_dict()},
-            os.path.join(self.checkpoints_path, self.model_params_subdir, str(epoch) + ".pth"))
+            {"epoch": epoch, "model_state_dict": self.model.state_dict()},
+            os.path.join(self.ModelParameters_path, str(epoch) + ".pth"))
         torch.save(
-            {"epoch": epoch, "model_state_dict": self.network.state_dict()},
-            os.path.join(self.checkpoints_path, self.model_params_subdir, "latest.pth"))
+            {"epoch": epoch, "model_state_dict": self.model.state_dict()},
+            os.path.join(self.ModelParameters_path, "latest.pth"))
         torch.save(
-            {"epoch": epoch, "optimizer_state_dict": self.optimizer.state_dict()},
-            os.path.join(self.checkpoints_path, self.optimizer_params_subdir, str(epoch) + ".pth"))
+            {"epoch": epoch, "optimizer_state_dict": self.scheduler.optimizer.state_dict()},
+            os.path.join(self.OptimizerParameters_path, str(epoch) + ".pth"))
         torch.save(
-            {"epoch": epoch, "optimizer_state_dict": self.optimizer.state_dict()},
-            os.path.join(self.checkpoints_path, self.optimizer_params_subdir, "latest.pth"))
+            {"epoch": epoch, "optimizer_state_dict": self.scheduler.optimizer.state_dict()},
+            os.path.join(self.OptimizerParameters_path, "latest.pth"))
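
A matching load routine is implied by this layout but is not part of the commit; a minimal resume sketch under the same directory and key names (model and optimizer are assumed to be constructed beforehand):

import os
import torch

def load_checkpoints(checkpoints_path, model, optimizer, which="latest"):
    # Mirrors save_checkpoints: ModelParameters/ and OptimizerParameters/
    # each hold <epoch>.pth and latest.pth with the keys saved above.
    m = torch.load(os.path.join(checkpoints_path, "ModelParameters", f"{which}.pth"))
    o = torch.load(os.path.join(checkpoints_path, "OptimizerParameters", f"{which}.pth"))
    model.load_state_dict(m["model_state_dict"])
    optimizer.load_state_dict(o["optimizer_state_dict"])
    return m["epoch"]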
 if __name__ == "__main__":
-    data_dir = '../data/input_data' # data directory
     name_prefix = 'broken_bullet_50k'
-    train = NHREPNet_Training(data_dir, name_prefix, if_baseline=True, if_feature_sample=False)
+    conf = ConfigFactory.parse_file('./conversion/setup.conf')
+    try:
+        train = NHREPNet_Training(name_prefix, conf, if_baseline=True, if_feature_sample=False)
         train.run_nhrepnet_training()
+    except Exception as e:
+        logger.error(str(e))
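
Since the constructor now takes the parsed conf, one natural driver is to loop over train.fileprefix_list instead of hardcoding name_prefix; a hypothetical sketch, not part of the commit:

from pyhocon import ConfigFactory

conf = ConfigFactory.parse_file('./conversion/setup.conf')
for name_prefix in conf.get_list('train.fileprefix_list'):
    trainer = NHREPNet_Training(name_prefix, conf, if_baseline=True, if_feature_sample=False)
    trainer.run_nhrepnet_training()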