增加非零表面采样点，优化batch train

12 months ago · a3a2a2c7a1
7 changed files with 723 additions and 172 deletions
--- a/brep2sdf/IsoSurfacing.py
+++ b/brep2sdf/IsoSurfacing.py
@ -111,8 +111,8 @@ def main():
    parser = argparse.ArgumentParser(description='IsoSurface Generator')
    parser.add_argument('-i', '--input', type=str, required=True, help='Input model file (.pt)')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output mesh file (.ply)')
-    parser.add_argument('--depth', type=int, default=7, help='网格深度（分辨率）')
+    parser.add_argument('--depth', type=int, default=3, help='网格深度（分辨率）')
-    parser.add_argument('--box_size', type=float, default=2.0, help='边界框大小')
+    parser.add_argument('--box_size', type=float, default=1.0, help='边界框大小')
    parser.add_argument('--method', type=str, default='MC', choices=['MC'], help='表面提取方法')
    parser.add_argument('--use-gpu', action='store_true', help='使用GPU')
    parser.add_argument('--compare', type=str, help='GT网格文件（.ply）')
--- a/brep2sdf/batch_train.py
+++ b/brep2sdf/batch_train.py
@ -1,67 +1,152 @@
 import os
 import subprocess
 import argparse
 import logging
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from tqdm import tqdm
 # 配置日志记录
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-def main():
+def run_training_process(input_step: str, train_script: str, common_args: list) -> tuple[str, bool, str, str]:
-    # 定义 STEP 文件目录和名称列表文件路径
+    """
-    step_root_dir = "/home/wch/brep2sdf/data/step"
+    为单个 STEP 文件运行 train.py 子进程。
    name_list_path = "/home/wch/brep2sdf/data/name_list.txt"
    Args:
        input_step: 输入 STEP 文件的路径。
        train_script: train.py 脚本的路径。
        common_args: 传递给 train.py 的通用参数列表。
    Returns:
        Tuple: (输入文件路径, 是否成功, stdout, stderr)
    """
    command = [
        "python", train_script,
        *common_args,
        "-i", input_step,
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=True, # 如果返回非零退出码，则抛出 CalledProcessError
            timeout=600 # 添加超时设置（例如10分钟）
        )
        return input_step, True, result.stdout, result.stderr
    except subprocess.CalledProcessError as e:
        logger.error(f"命令 '{' '.join(command)}' 执行失败。")
        return input_step, False, e.stdout, e.stderr
    except subprocess.TimeoutExpired as e:
        logger.error(f"处理 '{input_step}' 超时。")
        return input_step, False, e.stdout or "", e.stderr or ""
    except Exception as e:
        logger.error(f"处理 '{input_step}' 时发生意外错误: {e}")
        return input_step, False, "", str(e)
 def main(args):
    # 读取名称列表
    try:
-        with open(name_list_path, 'r') as f:
+        with open(args.name_list_path, 'r') as f:
-            names = [line.strip() for line in f if line.strip()]  # 去除空行
+            names = [line.strip() for line in f if line.strip()]
        logger.info(f"从 '{args.name_list_path}' 读取了 {len(names)} 个名称。")
    except FileNotFoundError:
-        print(f"Error: File '{name_list_path}' not found.")
+        logger.error(f"错误: 文件 '{args.name_list_path}' 未找到。")
        return
    except Exception as e:
-        print(f"Error reading file '{name_list_path}': {e}")
+        logger.error(f"读取文件 '{args.name_list_path}' 时出错: {e}")
        return
-    # 遍历名称列表并处理每个 STEP 文件
+    # 准备 train.py 的通用参数
-    for name in tqdm(names, desc="Processing STEP files"):
+    # 注意：从命令行参数或其他配置中获取这些参数通常更好
-        step_dir = os.path.join(step_root_dir, name)
+    common_train_args = [
        "--use-normal",
        "--only-zero-surface",
        #"--force-reprocess",
        # 可以添加更多通用参数
    ]
    if args.train_args:
         common_train_args.extend(args.train_args)
-        # 动态生成 STEP 文件路径（假设只有一个文件）
+    tasks = []
-        step_files = [
+    skipped_count = 0
-            os.path.join(step_dir, f)
+    # 准备所有任务
-            for f in os.listdir(step_dir)
+    for name in names:
-            if f.endswith(".step") and f.startswith(name)
+        step_dir = os.path.join(args.step_root_dir, name)
-        ]
+        if not os.path.isdir(step_dir):
             logger.warning(f"目录 '{step_dir}' 不存在。跳过 '{name}'。")
             skipped_count += 1
             continue
-        if not step_files:
+        step_files = []
-            print(f"Warning: No STEP files found in directory '{step_dir}'. Skipping...")
+        try:
-            continue
+            step_files = [
                os.path.join(step_dir, f)
                for f in os.listdir(step_dir)
                if f.lower().endswith((".step", ".stp")) and f.startswith(name) # 支持 .stp 并忽略大小写
            ]
        except OSError as e:
             logger.warning(f"无法访问目录 '{step_dir}': {e}。跳过 '{name}'。")
             skipped_count += 1
             continue
-        # 假设我们只处理第一个匹配的文件
+        if len(step_files) == 0:
-        input_step = step_files[0]
+            logger.warning(f"在目录 '{step_dir}' 中未找到匹配的 STEP 文件。跳过 '{name}'。")
            skipped_count += 1
        elif len(step_files) > 1:
            logger.warning(f"在目录 '{step_dir}' 中找到多个匹配的 STEP 文件，将使用第一个: {step_files[0]}。")
            tasks.append(step_files[0])
        else:
            tasks.append(step_files[0])
-        # 构造子进程命令
+    if not tasks:
-        command = [
+        logger.info("没有找到需要处理的有效 STEP 文件。")
-            "python", "train.py",
+        return
            "--use-normal",
            "-i", input_step,  # 输入文件路径
        ]
-        # 调用子进程运行命令
+    logger.info(f"准备处理 {len(tasks)} 个 STEP 文件，跳过了 {skipped_count} 个名称。")
-        try:
+
-            result = subprocess.run(
+    success_count = 0
-                command,
+    failure_count = 0
-                capture_output=True,
+
-                text=True,
+    # 使用 ProcessPoolExecutor 进行并行处理
-                check=True  # 如果返回非零退出码，则抛出 CalledProcessError
+    with ProcessPoolExecutor(max_workers=args.workers) as executor:
-            )
+        # 提交所有任务
-            print(f"Processed '{input_step}' successfully.")
+        futures = {
-            print("STDOUT:", result.stdout)
+            executor.submit(run_training_process, task_path, args.train_script, common_train_args): task_path
-            print("STDERR:", result.stderr)
+            for task_path in tasks
-        except subprocess.CalledProcessError as e:
+        }
            print(f"Error processing '{input_step}': {e}")
            print("STDOUT:", e.stdout)
            print("STDERR:", e.stderr)
        except Exception as e:
            print(f"Unexpected error processing '{input_step}': {e}")
        # 使用 tqdm 显示进度并处理结果
        for future in tqdm(as_completed(futures), total=len(tasks), desc="运行训练"):
            input_path = futures[future]
            try:
                input_step, success, stdout, stderr = future.result()
                if success:
                    success_count += 1
                    # 可以选择记录成功的 stdout/stderr，但通常只记录失败的更有用
                    # logger.debug(f"成功处理 '{input_step}'. STDOUT:\n{stdout}\nSTDERR:\n{stderr}")
                else:
                    failure_count += 1
                    logger.error(f"处理 '{input_step}' 失败。STDOUT:\n{stdout}\nSTDERR:\n{stderr}")
            except Exception as e:
                failure_count += 1
                logger.error(f"处理任务 '{input_path}' 时获取结果失败: {e}")
    logger.info(f"处理完成。成功: {success_count}, 失败: {failure_count}, 跳过: {skipped_count}")
 if __name__ == '__main__':
-    main()
+    parser = argparse.ArgumentParser(description="批量运行 train.py 处理 STEP 文件。")
    parser.add_argument('--step-root-dir', type=str, default="/home/wch/brep2sdf/data/step",
                        help="包含 STEP 子目录的根目录。")
    parser.add_argument('--name-list-path', type=str, default="/home/wch/brep2sdf/data/name_list.txt",
                        help="包含要处理的名称列表的文件路径。")
    parser.add_argument('--train-script', type=str, default="train.py",
                        help="要执行的训练脚本路径。")
    parser.add_argument('--workers', type=int, default=os.cpu_count(),
                        help="用于并行处理的工作进程数。")
    parser.add_argument('--train-args', nargs='*',
                        help="传递给 train.py 的额外参数 (例如 --epochs 10 --batch-size 32)。")
    args = parser.parse_args()
    main(args)
--- a/brep2sdf/config/default_config.py
+++ b/brep2sdf/config/default_config.py
@ -27,7 +27,7 @@ class ModelConfig:
@dataclass
 class DataConfig:
    """数据相关配置"""
-    max_face: int = 80
+    max_face: int = 400
    max_edge: int = 16
    num_query_points: int = 32*32*32  # 限制查询点数量，sdf 采样点数 本来是 128*128*128 ，在data load时随机采样
    bbox_scaled: float = 1.0
@ -47,8 +47,8 @@ class TrainConfig:
    # 基本训练参数
    batch_size: int = 8
    num_workers: int = 4
-    num_epochs: int = 200
+    num_epochs: int = 1
-    learning_rate: float = 0.01
+    learning_rate: float = 0.0001
    min_lr: float = 1e-5
    weight_decay: float = 0.01
--- a/brep2sdf/data/data.py
+++ b/brep2sdf/data/data.py
@ -318,6 +318,149 @@ def load_sdf_file(sdf_path: str, num_query_points: int = 4096) -> torch.Tensor:
        logger.error(f"Error message: {str(e)}")
        raise
 def prepare_sdf_data(surf_data, normals=None, max_points=100000, device='cuda'):
    """
    准备SDF数据，合并表面点云，可选地包含法线，并进行降采样。
    Args:
        surf_data: list[np.ndarray] 每个元素是一个形状为(M, 3)的表面点云数组。
        normals: list[np.ndarray] | None 每个元素是形状为(M, 3)的法线数组，与surf_data对应。
        max_points: int 降采样后的最大点数。
        device: str | torch.device Pytorch设备。
    Returns:
        torch.Tensor: 形状为 (N, 4) 或 (N, 7) 的张量，N <= max_points。
                      列为 [x, y, z, sdf=0] 或 [x, y, z, nx, ny, nz, sdf=0]。
    """
    total_points = sum(len(s) for s in surf_data)
    has_normals = normals is not None
    # 确定输出数组的形状
    num_features = 7 if has_normals else 4
    output_size = min(total_points, max_points)
    sdf_array = np.zeros((output_size, num_features), dtype=np.float32)
    if total_points > max_points:
        # --- 执行降采样 ---
        logger.debug(f"总点数 {total_points} 超过 {max_points}，执行降采样...")
        indices = []
        for i, points in enumerate(surf_data):
            indices.extend([(i, j) for j in range(len(points))])
        np.random.shuffle(indices)
        selected_indices = indices[:max_points] # 选择前max_points个
        # 根据索引填充sdf_array
        for idx, (surf_idx, pnt_idx) in enumerate(selected_indices):
            sdf_array[idx, :3] = surf_data[surf_idx][pnt_idx]
            if has_normals:
                # 检查法线数据是否存在且有效
                if surf_idx < len(normals) and pnt_idx < len(normals[surf_idx]):
                     sdf_array[idx, 3:6] = normals[surf_idx][pnt_idx]
                else:
                     logger.warning(f"降采样时发现无效的法线索引: surf_idx={surf_idx}, pnt_idx={pnt_idx}")
                     # 可以选择填充默认值，例如 [0, 0, 1]
                     sdf_array[idx, 3:6] = np.array([0.0, 0.0, 1.0], dtype=np.float32)
    else:
        # --- 不执行降采样，使用所有点 ---
        logger.debug(f"总点数 {total_points} 未超过 {max_points}，使用所有点。")
        # 直接拼接所有点
        all_points = np.concatenate(surf_data, axis=0)
        sdf_array[:, :3] = all_points
        if has_normals:
            # 检查法线数据是否与点数据匹配
            total_normal_points = sum(len(n) for n in normals)
            if total_normal_points == total_points:
                all_normals = np.concatenate(normals, axis=0)
                sdf_array[:, 3:6] = all_normals
            else:
                 logger.error(f"点数 ({total_points}) 与法线点数 ({total_normal_points}) 不匹配！")
                 # 处理不匹配的情况，例如只填充坐标，或者抛出错误
                 # 这里选择只填充坐标，并将法线部分保留为0
                 sdf_array = sdf_array[:, :4] # 退化为只有坐标和SDF
                 sdf_array[:, -1] = 0.0 # 确保SDF为0
                 # 或者可以填充默认法线
                 # sdf_array[:, 3:6] = np.tile(np.array([0.0, 0.0, 1.0]), (total_points, 1))
    # 注意：表面点的SDF值通常设为0
    sdf_array[:, -1] = 0.0
    return torch.tensor(sdf_array, dtype=torch.float32, device=device)
 def print_data_distribution(data: torch.Tensor) -> None:
    """打印数据分布统计信息
    Args:
        data: 形状为 (N, 7) 的张量 [x, y, z, nx, ny, nz, sdf] 或 (N, 4) 的张量 [x, y, z, sdf]
    """
    # 检查数据维度
    n_features = data.shape[1]
    has_normals = n_features == 7
    # 统计坐标信息
    coords = data[:, :3]
    logger.debug("坐标分布统计:")
    logger.debug(f"  范围: min={coords.min(dim=0)[0]}, max={coords.max(dim=0)[0]}")
    logger.debug(f"  均值: mean={coords.mean(dim=0)}")
    logger.debug(f"  标准差: std={coords.std(dim=0)}")
    # 如果有法向量，统计法向量信息
    if has_normals:
        normals = data[:, 3:6]
        normal_lengths = torch.norm(normals, dim=1)
        logger.debug("\n法向量分布统计:")
        logger.debug(f"  范围: min={normals.min(dim=0)[0]}, max={normals.max(dim=0)[0]}")
        logger.debug(f"  均值: mean={normals.mean(dim=0)}")
        logger.debug(f"  标准差: std={normals.std(dim=0)}")
        logger.debug(f"  法向量长度: mean={normal_lengths.mean():.4f}, std={normal_lengths.std():.4f}")
    # 统计SDF值信息
    sdf = data[:, -1]
    logger.debug("\nSDF值分布统计:")
    logger.debug(f"  范围: min={sdf.min():.4f}, max={sdf.max():.4f}")
    logger.debug(f"  均值: mean={sdf.mean():.4f}")
    logger.debug(f"  标准差: std={sdf.std():.4f}")
    logger.debug(f"  零值附近(|sdf|<1e-4)的点数量: {torch.sum(torch.abs(sdf) < 1e-4)}")
 # --- 添加一个辅助函数用于检查 ---
 def check_tensor(tensor: torch.Tensor | None, name: str, epoch: int, step: int = -1) -> bool:
    """检查张量是否包含 inf 或 nan"""
    prefix = f"Epoch {epoch}" + (f" Step {step}" if step >= 0 else "")
    if tensor is None:
        # 对于可选的张量（如 normals），None 是有效的，但对于其他张量可能是问题
        # logger.warning(f"{prefix}: Tensor '{name}' is None.")
        return False # 返回 False 表示没有检测到 inf/nan (但要注意 None 本身)
    if not isinstance(tensor, torch.Tensor):
        logger.info(f"{prefix}: '{name}' is not a Tensor, but {type(tensor)}.")
        return True # 类型错误，视为问题
    has_inf = torch.isinf(tensor).any()
    has_nan = torch.isnan(tensor).any()
    if has_inf:
        logger.info(f"{prefix}: !!! Infinity detected in '{name}' !!!")
        # 可以选择性地打印更多信息
        # inf_indices = torch.where(torch.isinf(tensor))
        # logger.error(f"Inf indices: {inf_indices}")
        # logger.error(f"Inf values sample: {tensor[inf_indices][:5]}")
    if has_nan:
        logger.info(f"{prefix}: !!! NaN detected in '{name}' !!!")
        # nan_indices = torch.where(torch.isnan(tensor))
        # logger.error(f"NaN indices: {nan_indices}")
    return has_inf or has_nan
 # --- 辅助函数结束 ---
 def test_dataset():
    """测试数据集功能"""
    try:
--- a/brep2sdf/data/pre_process_by_mesh.py
+++ b/brep2sdf/data/pre_process_by_mesh.py
@ -18,6 +18,8 @@ from scipy.spatial import cKDTree
 from brep2sdf.utils.logger import logger
 import tempfile
 import trimesh
 from trimesh.proximity import ProximityQuery
 # 导入OpenCASCADE相关库
 from OCC.Core.STEPControl import STEPControl_Reader  # STEP文件读取器
 from OCC.Core.TopExp import TopExp_Explorer, topexp  # 拓扑结构遍历
@ -167,7 +169,7 @@ def get_bbox(shape, subshape):
-def parse_solid(step_path,sample_normal_vector=False):
+def parse_solid(step_path,sample_normal_vector=False,sample_sdf_points=False):
    """
    解析STEP文件中的CAD模型数据
@ -260,7 +262,7 @@ def parse_solid(step_path,sample_normal_vector=False):
    while edge_explorer.More():
        edge = topods.Edge(edge_explorer.Current())
        edges.append(edge)
-        logger.debug(len(edges))
+        #logger.debug(len(edges))
        curve_info = BRep_Tool.Curve(edge)
        if curve_info is None:
            continue  # 跳过无效边
@ -378,19 +380,76 @@ def parse_solid(step_path,sample_normal_vector=False):
        }
    }
-    if sample_normal_vector:
+    trimesh_mesh = None
-         # 从 mesh 读 法向量
+    trimesh_mesh_ncs = None
-        mesh.Perform()
+
-        # 导出为STL临时文件
+    # --- Trimesh 加载和处理 (如果需要) ---
    if sample_normal_vector or sample_sdf_points:
        logger.debug("加载 Trimesh 用于法线/SDF 采样...")
        # 注意：这里的 mesh (BRepMesh_IncrementalMesh) 与 trimesh 不同
        # 需要从原始 shape 导出 STL
        stl_writer = StlAPI_Writer()
        stl_writer.SetASCIIMode(False)
-        with tempfile.NamedTemporaryFile(suffix='.stl') as tmp:
+        tmp_stl_path = ""
-            stl_writer.Write(shape, tmp.name)
+        try:
-            trimesh_mesh = trimesh.load(tmp.name)
+            with tempfile.NamedTemporaryFile(suffix='.stl', delete=True) as tmp:
-        data['surf_pnt_normals']= batch_compute_normals(trimesh_mesh,surfs_wcs)
+                tmp_stl_path = tmp.name
                # 检查 shape 是否有效
                if shape.IsNull():
                     raise ValueError("OCC Shape is Null, cannot write STL.")
                success = stl_writer.Write(shape, tmp_stl_path)
                if not success:
                     raise RuntimeError(f"StlAPI_Writer failed to write {tmp_stl_path}")
                trimesh_mesh = trimesh.load(tmp_stl_path)
            # 创建归一化 Trimesh
            vertices_wcs = trimesh_mesh.vertices.astype(np.float32)
            vertices_ncs = (vertices_wcs - data['normalization_params']['center']) / data['normalization_params']['scale']
            trimesh_mesh_ncs = trimesh.Trimesh(vertices=vertices_ncs, faces=trimesh_mesh.faces, process=False)
            if not trimesh_mesh_ncs.is_watertight:
                logger.debug(f"{step_path} 的归一化网格不是 watertight，尝试修复。")
                trimesh.repair.fill_holes(trimesh_mesh_ncs)
                if not trimesh_mesh_ncs.is_watertight:
                    logger.warning(f"{step_path} 的归一化网格修复后仍不是 watertight。")
        except Exception as e:
            logger.error(f"为 {step_path} 加载/处理 Trimesh 失败: {e}")
            trimesh_mesh = None
            trimesh_mesh_ncs = None
    # --- 计算表面点法线 ---
    if sample_normal_vector and trimesh_mesh_ncs is not None:
        logger.debug("计算表面点法线...")
        # 使用 data['surf_ncs'] 因为它们已经是归一化后的点云
        if data['surf_ncs'].shape[0] > 0:
             # 确保 batch_compute_normals 使用归一化的 mesh
             data['surf_pnt_normals'] = batch_compute_normals(trimesh_mesh_ncs, data['surf_ncs'])
        else:
             logger.warning("没有有效的归一化表面点云用于法线计算。")
             data['surf_pnt_normals'] = np.array([], dtype=object)
    elif sample_normal_vector:
         logger.warning("请求了表面法线计算，但 Trimesh 加载失败。")
         data['surf_pnt_normals'] = np.array([], dtype=object) # 添加空键
    # --- SDF 点采样 ---
    data['sampled_points_normals_sdf'] = None # 初始化键
    if sample_sdf_points:
        if trimesh_mesh_ncs is not None:
            # 调用封装的函数，传递固定数量参数
            data['sampled_points_normals_sdf'] = sample_sdf_points_and_normals(
                trimesh_mesh_ncs=trimesh_mesh_ncs,
                surf_bbox_ncs=data['surf_bbox_ncs'],
                num_sdf_samples=4096, # <-- 传递固定数量
                sdf_sampling_std_dev=0.0001
            )
        else:
            logger.warning("请求了 SDF 点采样，但 Trimesh 加载失败。")
    return data
 def load_step(step_path):
    """Load STEP file and return solids"""
    reader = STEPControl_Reader()
@ -474,6 +533,206 @@ def batch_compute_normals(mesh, surf_wcs, normal_type='vertex', k_neighbors=3):
    return normals_output
 def sample_sdf_points_and_normals(
    trimesh_mesh_ncs: trimesh.Trimesh,
    surf_bbox_ncs: np.ndarray,
    num_sdf_samples: int = 4096,
    sdf_sampling_std_dev: float = 0.01
 ) -> np.ndarray | None:
    """
    在归一化坐标系(NCS)下采样固定数量的点，并计算它们的SDF值和最近表面法线。
    采用均匀采样和近表面采样的混合策略。
    参数:
        trimesh_mesh_ncs: 归一化的 Trimesh 对象。
        surf_bbox_ncs:    归一化坐标系下各面的包围盒 [num_faces, 6]。
        num_sdf_samples:  要采样的总点数。
        sdf_sampling_std_dev: 近表面采样时沿法线扰动的标准差。
    返回:
        np.ndarray | None: 形状为 (N, 7) 的数组 [x, y, z, nx, ny, nz, sdf]，
                         如果采样或计算失败则返回 None。
    """
    logger.debug("为 SDF 计算采样点 (固定数量策略)...")
    if trimesh_mesh_ncs is None or not isinstance(trimesh_mesh_ncs, trimesh.Trimesh):
        logger.error("无效的 Trimesh 对象提供给 SDF 采样。")
        return None
    if num_sdf_samples <= 0:
        logger.warning("请求的 SDF 采样点数为零或负数，不进行采样。")
        return None
    # 使用固定的采样边界 [-0.5, 0.5]，因为我们已经做过归一化
    min_bound_ncs = np.array([-0.5, -0.5, -0.5], dtype=np.float32)
    max_bound_ncs = np.array([0.5, 0.5, 0.5], dtype=np.float32)
    bbox_size_ncs = max_bound_ncs - min_bound_ncs
    # --- 使用固定的总样本数分配点数 ---
    num_uniform_samples = num_sdf_samples // 2
    num_near_surface_samples = num_sdf_samples - num_uniform_samples
    logger.debug(f"固定 SDF 采样点数: {num_sdf_samples} (均匀: {num_uniform_samples}, 近表面: {num_near_surface_samples})")
    # --- 执行采样 ---
    sampled_points_list = []
    # 均匀采样 (在 [-0.5, 0.5] 范围内)
    if num_uniform_samples > 0:
        uniform_points = np.random.uniform(-0.5, 0.5, (num_uniform_samples, 3))
        sampled_points_list.append(uniform_points)
    # 近表面采样
    if num_near_surface_samples > 0:
        if trimesh_mesh_ncs.faces.shape[0] > 0:
            try:
                near_points_on_surface = trimesh_mesh_ncs.sample(num_near_surface_samples)
                proximity_query_near = ProximityQuery(trimesh_mesh_ncs)
                closest_points_near, distances_near, face_indices_near = proximity_query_near.on_surface(near_points_on_surface)
                if np.any(face_indices_near >= len(trimesh_mesh_ncs.face_normals)):
                    raise IndexError("Face index out of bounds during near-surface normal lookup")
                normals_near = trimesh_mesh_ncs.face_normals[face_indices_near]
                perturbations = np.random.randn(num_near_surface_samples, 1) * sdf_sampling_std_dev
                near_points = near_points_on_surface + normals_near * perturbations
                # 确保近表面点也在 [-0.5, 0.5] 范围内
                near_points = np.clip(near_points, -0.5, 0.5)
                sampled_points_list.append(near_points)
            except Exception as e:
                logger.warning(f"近表面采样失败: {e}。将替换为均匀采样点。")
                fallback_uniform = np.random.uniform(-0.5, 0.5, (num_near_surface_samples, 3))
                sampled_points_list.append(fallback_uniform)
        else:
            logger.warning("网格没有面，无法执行近表面采样。将替换为均匀采样点。")
            fallback_uniform = np.random.uniform(-0.5, 0.5, (num_near_surface_samples, 3))
            sampled_points_list.append(fallback_uniform)
    # --- 合并采样点 ---
    if not sampled_points_list:
        logger.warning("没有为SDF采样到任何点。")
        return None
    sampled_points_ncs = np.vstack(sampled_points_list).astype(np.float32)
    try:
        proximity_query = ProximityQuery(trimesh_mesh_ncs)
        # 分批计算SDF以避免内存问题
        batch_size = 1000
        sdf_values = []
        closest_points = []
        face_indices = []
        for i in range(0, len(sampled_points_ncs), batch_size):
            batch_points = sampled_points_ncs[i:i + batch_size]
            # 计算当前批次的最近点和面
            batch_closest, batch_distances, batch_faces = proximity_query.on_surface(batch_points)
            # 计算点到最近面的向量
            direction_vectors = batch_points - batch_closest
            # 使用batch_compute_normals计算最近点的法向量
            # 将batch_closest重新组织为所需的格式 (N,) 数组，每个元素是 (M, 3) 数组
            closest_points_reshaped = np.array([batch_closest], dtype=object)
            closest_points_reshaped[0] = batch_closest
            # 计算法向量
            normals_batch = batch_compute_normals(
                trimesh_mesh_ncs,
                closest_points_reshaped,
                normal_type='vertex',  # 使用顶点法向量
                k_neighbors=3
            )[0]  # 取第一个元素因为我们只传入了一个批次
            # 计算方向向量与法向量的点积
            dot_products = np.sum(direction_vectors * normals_batch, axis=1)
            signs = np.sign(dot_products)
            # 确保零点处的符号处理
            zero_mask = np.abs(batch_distances) < 1e-6
            signs[zero_mask] = 0.0
            # 计算带符号距离
            batch_sdf = batch_distances * signs
            # 限制SDF值的范围
            batch_sdf = np.clip(batch_sdf, -1.414, 1.414)  # sqrt(2)
            # 添加调试信息
            if i == 0:  # 只打印第一个批次的统计信息
                logger.debug(f"批次统计 (首批次):")
                logger.debug(f"  法向量范围: [{normals_batch.min():.4f}, {normals_batch.max():.4f}]")
                logger.debug(f"  法向量长度: {np.linalg.norm(normals_batch, axis=1).mean():.4f}")
                logger.debug(f"  距离范围: [{batch_distances.min():.4f}, {batch_distances.max():.4f}]")
                logger.debug(f"  点积范围: [{dot_products.min():.4f}, {dot_products.max():.4f}]")
                logger.debug(f"  符号分布: 正={np.sum(signs > 0)}, 负={np.sum(signs < 0)}, 零={np.sum(signs == 0)}")
                logger.debug(f"  SDF范围: [{batch_sdf.min():.4f}, {batch_sdf.max():.4f}]")
            sdf_values.append(batch_sdf)
            closest_points.append(batch_closest)
            face_indices.append(batch_faces)
        # 合并批次结果
        sdf_values = np.concatenate(sdf_values)
        closest_points = np.concatenate(closest_points)
        # 为所有点计算法向量
        all_points_reshaped = np.array([closest_points], dtype=object)
        all_points_reshaped[0] = closest_points
        sampled_normals = batch_compute_normals(
            trimesh_mesh_ncs,
            all_points_reshaped,
            normal_type='vertex',
            k_neighbors=3
        )[0]
        # 验证法向量
        normal_lengths = np.linalg.norm(sampled_normals, axis=1)
        logger.debug(f"最终法向量统计:")
        logger.debug(f"  形状: {sampled_normals.shape}")
        logger.debug(f"  长度: min={normal_lengths.min():.4f}, max={normal_lengths.max():.4f}, mean={normal_lengths.mean():.4f}")
        logger.debug(f"  分量范围: x=[{sampled_normals[:,0].min():.4f}, {sampled_normals[:,0].max():.4f}]")
        logger.debug(f"  分量范围: y=[{sampled_normals[:,1].min():.4f}, {sampled_normals[:,1].max():.4f}]")
        logger.debug(f"  分量范围: z=[{sampled_normals[:,2].min():.4f}, {sampled_normals[:,2].max():.4f}]")
        # 添加验证
        valid_mask = (
            ~np.isnan(sdf_values) & ~np.isinf(sdf_values) &
            ~np.isnan(sampled_normals).any(axis=1) & ~np.isinf(sampled_normals).any(axis=1) &
            ~np.isnan(sampled_points_ncs).any(axis=1) & ~np.isinf(sampled_points_ncs).any(axis=1)
        )
        if not np.all(valid_mask):
            num_invalid = sampled_points_ncs.shape[0] - np.sum(valid_mask)
            logger.warning(f"在 SDF/法线计算中发现 {num_invalid} 个无效条目。将它们过滤掉。")
            sampled_points_ncs = sampled_points_ncs[valid_mask]
            sampled_normals = sampled_normals[valid_mask]
            sdf_values = sdf_values[valid_mask]
        if sampled_points_ncs.shape[0] > 0:
            combined_data = np.hstack((
                sampled_points_ncs,
                sampled_normals,
                sdf_values[:, np.newaxis]
            )).astype(np.float32)
            # 添加SDF分布验证
            final_sdf = combined_data[:, -1]
            logger.debug(f"最终SDF分布验证:")
            logger.debug(f"  正值点数: {np.sum(final_sdf > 0)}")
            logger.debug(f"  负值点数: {np.sum(final_sdf < 0)}")
            logger.debug(f"  零值点数: {np.sum(np.abs(final_sdf) < 1e-6)}")
            logger.debug(f"  SDF范围: [{final_sdf.min():.4f}, {final_sdf.max():.4f}]")
            # 验证分布是否合理
            if np.sum(final_sdf > 0) == 0 or np.sum(final_sdf < 0) == 0:
                logger.warning("警告：SDF值分布异常，没有正值或负值！")
            return combined_data
        else:
            logger.warning("过滤 SDF/法线结果后没有剩余有效点。")
            return None
    except Exception as e:
        logger.error(f"计算 SDF 或法线时失败: {str(e)}")
        return None
 def check_data_format(data, step_file):
    """检查数据格式是否正确"""
    required_keys = [
@ -513,7 +772,7 @@ def check_data_format(data, step_file):
    return True, ""
-def process_single_step(step_path:str, output_path:str=None, sample_normal_vector=False, timeout:int=300) -> dict:
+def process_single_step(step_path:str, output_path:str=None, sample_normal_vector=False, sample_sdf_points=False, timeout:int=300) -> dict:
    """处理单个STEP文件, 从 brep 2 pkl
    return data = {
        'surf_wcs': np.array(surfs_wcs, dtype=object),    # 世界坐标系下的曲面几何数据(对象数组)
@ -537,7 +796,7 @@ def process_single_step(step_path:str, output_path:str=None, sample_normal_vecto
            logger.error(f"文件格式不支持，必须是.step或.stp文件: {step_path}")
            return None
        # 解析STEP文件
-        data = parse_solid(step_path, sample_normal_vector)
+        data = parse_solid(step_path, sample_normal_vector,sample_sdf_points)
        if data is None:
            logger.error(f"Failed to parse STEP file: {step_path}")
            return None
@ -551,12 +810,11 @@ def process_single_step(step_path:str, output_path:str=None, sample_normal_vecto
        # 保存结果
        if output_path:
            try:
-                logger.info(f"Saving results to: {output_path}")
+                logger.debug(f"Saving results to: {output_path}")
                os.makedirs(os.path.dirname(output_path), exist_ok=True)
                with open(output_path, 'wb') as f:
                    pickle.dump(data, f)
-                logger.info("数据预处理完成")
+                logger.debug(f"Results saved successfully: {output_path}")
                logger.info(f"Results saved successfully: {output_path}")
                return data
            except Exception as e:
                logger.error(f'Failed to save {output_path}: {str(e)}')
@ -588,19 +846,19 @@ def test(step_file_path, output_path=None):
            return None
        # 打印统计信息
-        logger.info("\nStatistics:")
+        logger.debug("\nStatistics:")
-        logger.info(f"Number of surfaces: {len(data['surf_wcs'])}")
+        logger.debug(f"Number of surfaces: {len(data['surf_wcs'])}")
-        logger.info(f"Number of edges: {len(data['edge_wcs'])}")
+        logger.debug(f"Number of edges: {len(data['edge_wcs'])}")
-        logger.info(f"Number of corners: {len(data['corner_wcs'])}")  # 修正为corner_wcs
+        logger.debug(f"Number of corners: {len(data['corner_wcs'])}")  # 修正为corner_wcs
        # 保存结果
        if output_path:
            try:
-                logger.info(f"Saving results to: {output_path}")
+                logger.debug(f"Saving results to: {output_path}")
                os.makedirs(os.path.dirname(output_path), exist_ok=True)
                with open(output_path, 'wb') as f:
                    pickle.dump(data, f)
-                logger.info(f"Results saved successfully: {output_path}")
+                logger.debug(f"Results saved successfully: {output_path}")
            except Exception as e:
                logger.error(f"Failed to save {output_path}: {str(e)}")
                return None
--- a/brep2sdf/networks/encoder.py
+++ b/brep2sdf/networks/encoder.py
@ -81,7 +81,6 @@ class Encoder(nn.Module):
    def _init_parameters(self):
        """为所有叶子节点初始化特征参数"""
        # 使用栈模拟递归遍历（避免递归）
        stack = [(self.octree, "root")]  # (当前节点, 当前路径)
        param_index = 0  # 参数索引计数器
--- a/brep2sdf/train.py
+++ b/brep2sdf/train.py
@ -7,7 +7,7 @@ import numpy as np
 import argparse
 from brep2sdf.config.default_config import get_default_config
-from brep2sdf.data.data import load_brep_file,load_sdf_file 
+from brep2sdf.data.data import load_brep_file,load_sdf_file, prepare_sdf_data, print_data_distribution, check_tensor
 from brep2sdf.data.pre_process_by_mesh import process_single_step
 from brep2sdf.networks.network import Net
 from brep2sdf.networks.octree import OctreeNode
@ -24,6 +24,11 @@ parser.add_argument(
    action='store_true',  # 默认为 False，如果用户指定该参数，则为 True
    help='强制采样点有法向量'
 )
 parser.add_argument(
    '--only-zero-surface', 
    action='store_true',  # 默认为 False，如果用户指定该参数，则为 True
    help='只采样零表面点 SDF 训练'
 )
 parser.add_argument(
    '--force-reprocess', 
    action='store_true',  # 默认为 False，如果用户指定该参数，则为 True
@ -32,43 +37,6 @@ parser.add_argument(
 args = parser.parse_args()
 def prepare_sdf_data(surf_data, normals=None, max_points=100000, device='cuda'):
    total_points = sum(len(s) for s in surf_data)
    # 降采样逻辑（修复版）
    if total_points > max_points:
        # 生成索引
        indices = []
        for i, points in enumerate(surf_data):
            indices.extend([(i, j) for j in range(len(points))])
        # 随机打乱索引
        np.random.shuffle(indices)
        # 选择前max_points个索引
        selected_indices = indices[:max_points]
        if not normals is None:
            # 根据索引构建sdf_array
            sdf_array = np.zeros((max_points, 4), dtype=np.float32)
            for idx, (i, j) in enumerate(selected_indices):
                sdf_array[idx, :3] = surf_data[i][j]
        else:
            sdf_array = np.zeros((max_points, 7), dtype=np.float32)
            for idx, (i, j) in enumerate(selected_indices):
                sdf_array[idx, :3] = surf_data[i][j]
                sdf_array[idx, 3:6] = normals[i][j]
    else:
        if not normals is  None:
            sdf_array = np.zeros((total_points, 4), dtype=np.float32)
            sdf_array[:, :3] = np.concatenate(surf_data)
            sdf_array = np.zeros((max_points, 7), dtype=np.float32)
        else:
            for idx, (i, j) in enumerate(selected_indices):
                sdf_array[idx, :3] = surf_data[i][j]
                sdf_array[idx, 3:6] = normals[i][j]
    return torch.tensor(sdf_array, dtype=torch.float32, device=device)
 class Trainer:
    def __init__(self, config, input_step):
@ -85,18 +53,33 @@ class Trainer:
                logger.error(f"fail to load {data_path}, {str(e)}")
                raise e
            if args.use_normal and self.data.get("surf_pnt_normals", None) is  None:
-                self.data = process_single_step(step_path=input_step, output_path=data_path, sample_normal_vector=args.use_normal)
+                self.data = process_single_step(step_path=input_step, output_path=data_path, sample_normal_vector=args.use_normal,sample_sdf_points=not args.only_zero_surface)
        else:
-            self.data = process_single_step(step_path=input_step, output_path=data_path, sample_normal_vector=args.use_normal)
+            self.data = process_single_step(step_path=input_step, output_path=data_path, sample_normal_vector=args.use_normal,sample_sdf_points=not args.only_zero_surface)
        # 将曲面点云列表转换为 (N*M, 4) 数组
        surfs = self.data["surf_ncs"]
-        self.sdf_data = prepare_sdf_data(
+        
        # 准备表面点的SDF数据
        surface_sdf_data = prepare_sdf_data(
            surfs, 
-            normals = self.data["surf_pnt_normals"],
+            normals=self.data["surf_pnt_normals"],
            max_points=4096,
            device=self.device
        )
        # 如果不是仅使用零表面，则合并采样点数据
        if not args.only_zero_surface:
            # 加载采样点数据
            sampled_sdf_data = torch.tensor(
                self.data['sampled_points_normals_sdf'], 
                dtype=torch.float32, 
                device=self.device
            )
            # 合并表面点数据和采样点数据
            self.sdf_data = torch.cat([surface_sdf_data, sampled_sdf_data], dim=0)
        else:
            self.sdf_data = surface_sdf_data
        print_data_distribution(self.sdf_data)
        # 初始化数据集
        #self.brep_data = load_brep_file(self.config.data.pkl_path)
        #logger.info( self.brep_data )
@ -146,71 +129,154 @@ class Trainer:
    def _calculate_global_bbox(self, surf_bbox: torch.Tensor) -> torch.Tensor:
        """
-        计算整个数据集的全局边界框，综合考虑表面包围盒和采样点
+        返回一个固定的全局边界框（单位立方体）。
        参数:
-            surf_bbox: 形状为 (num_edges, 6) 的Tensor，表示每条边的包围盒
+            surf_bbox: (此参数在此实现中未使用)
                          [xmin, ymin, zmin, xmax, ymax, zmax]
        返回:
-            形状为 (6,) 的Tensor，格式为 [x_min, y_min, z_min, x_max, y_max, z_max]
+            形状为 (6,) 的Tensor，表示固定的边界框 [-0.5, -0.5, -0.5, 0.5, 0.5, 0.5]
        """
-        # 验证输入
+        # 直接定义固定的单位立方体边界框
-        if not isinstance(surf_bbox, torch.Tensor):
+        # 注意：确保张量在正确的设备上创建
-            raise TypeError(f"surf_bbox 必须是 torch.Tensor，但得到 {type(surf_bbox)}")
+        fixed_bbox = torch.tensor([-0.5, -0.5, -0.5, 0.5, 0.5, 0.5],
-        if surf_bbox.dim() != 2 or surf_bbox.shape[1] != 6:
+                                  dtype=torch.float32,
-            raise ValueError(f"surf_bbox 形状应为 (num_edges, 6)，但得到 {surf_bbox.shape}")
+                                  device=self.device) # 假设 self.device 存储了目标设备
-
+
-        # 计算表面包围盒的全局范围
+        logger.debug(f"使用固定的全局边界框: {fixed_bbox.cpu().numpy()}")
-        global_min = surf_bbox[:, :3].min(dim=0).values
+        return fixed_bbox
-        global_max = surf_bbox[:, 3:].max(dim=0).values
+
-
+        # --- 旧的计算逻辑 (注释掉或删除) ---
-
+        # # 验证输入
-        # 返回合并后的边界框
+        # if not isinstance(surf_bbox, torch.Tensor):
-        return torch.cat([global_min, global_max])
+        #     raise TypeError(f"surf_bbox 必须是 torch.Tensor，但得到 {type(surf_bbox)}")
        # if surf_bbox.dim() != 2 or surf_bbox.shape[1] != 6:
        #     raise ValueError(f"surf_bbox 形状应为 (num_edges, 6)，但得到 {surf_bbox.shape}")
        #
        # # 计算表面包围盒的全局范围
        # global_min = surf_bbox[:, :3].min(dim=0).values
        # global_max = surf_bbox[:, 3:].max(dim=0).values
        #
        # # 返回合并后的边界框
        # return torch.cat([global_min, global_max])
        # return [-0.5,] # 这个是错误的
    def train_epoch(self, epoch: int) -> float:
        self.model.train()
        total_loss = 0.0
-        
+        step = 0 # 如果你的训练是分批次的，这里应该用批次索引
-
+
-        # 获取数据并移动到设备
+        # --- 1. 检查输入数据 ---
-        points = self.sdf_data[:,0:3]
+        # 注意：假设 self.sdf_data 包含 [x, y, z, nx, ny, nz, sdf] (7列) 或 [x, y, z, sdf] (4列)
-        points.requires_grad_(True)
+        # 并且 SDF 值总是在最后一列
        if self.sdf_data is None:
             logger.error(f"Epoch {epoch}: self.sdf_data is None. Cannot train.")
             return float('inf')
        points = self.sdf_data[:, 0:3].clone().detach() # 取前3列作为点
        gt_sdf = self.sdf_data[:, -1].clone().detach()  # 取最后一列作为SDF真值
        normals = None
        if args.use_normal:
-            normals = self.sdf_data[:,3:6]
+            if self.sdf_data.shape[1] < 7: # 检查是否有足够的列给法线
-            gt_sdf = self.sdf_data[:,6]
+                logger.error(f"Epoch {epoch}: --use-normal is specified, but sdf_data has only {self.sdf_data.shape[1]} columns.")
                return float('inf')
            normals = self.sdf_data[:, 3:6].clone().detach() # 取中间3列作为法线
        # 执行检查
        if check_tensor(points, "Input Points", epoch, step): return float('inf')
        if check_tensor(gt_sdf, "Input GT SDF", epoch, step): return float('inf')
        if args.use_normal:
            # 只有在请求法线时才检查 normals
            if check_tensor(normals, "Input Normals", epoch, step): return float('inf')
        else:
            gt_sdf = self.sdf_data[:,3]
-        # 前向传播
+        # --- 准备模型输入，启用梯度 ---
        points.requires_grad_(True) # 在检查之后启用梯度
        # --- 前向传播 ---
        self.optimizer.zero_grad()
        pred_sdf = self.model(points)
-        # 计算损失
+        # --- 2. 检查模型输出 ---
-        if args.use_normal:
+        if check_tensor(pred_sdf, "Predicted SDF (Model Output)", epoch, step): return float('inf')
-            loss,loss_details = self.loss_manager.compute_loss(
+
-                points,
+        # --- 计算损失 ---
-                normals,
+        loss = torch.tensor(float('nan'), device=self.device) # 初始化为 NaN 以防计算失败
-                gt_sdf,
+        loss_details = {}
-                pred_sdf
+        try:
-            )  # 计算损失
+            # --- 3. 检查损失计算前的输入 ---
-        else:
+            # (points 已经启用梯度，不需要再次检查inf/nan，但检查pred_sdf和gt_sdf)
-            loss = torch.nn.functional.mse_loss(pred_sdf, gt_sdf)
+            if check_tensor(pred_sdf, "Predicted SDF (Loss Input)", epoch, step): raise ValueError("Bad pred_sdf before loss")
            if check_tensor(gt_sdf, "GT SDF (Loss Input)", epoch, step): raise ValueError("Bad gt_sdf before loss")
            if args.use_normal:
                 # 检查法线和带梯度的点
                 if check_tensor(normals, "Normals (Loss Input)", epoch, step): raise ValueError("Bad normals before loss")
                 if check_tensor(points, "Points (Loss Input)", epoch, step): raise ValueError("Bad points before loss")
                 loss, loss_details = self.loss_manager.compute_loss(
                     points,
                     normals, # 传递检查过的 normals
                     gt_sdf,
                     pred_sdf
                 )
            else:
                loss = torch.nn.functional.mse_loss(pred_sdf, gt_sdf)
            # --- 4. 检查损失计算结果 ---
            if check_tensor(loss, "Calculated Loss", epoch, step):
                 logger.error(f"Epoch {epoch} Step {step}: Loss calculation resulted in inf/nan.")
                 if loss_details: logger.error(f"Loss Details: {loss_details}")
                 return float('inf') # 如果损失无效，停止这个epoch
        except Exception as loss_e:
            logger.error(f"Epoch {epoch} Step {step}: Error during loss calculation: {loss_e}", exc_info=True)
            return float('inf') # 如果计算出错，停止这个epoch
        # --- 反向传播和优化 ---
        try:
            loss.backward()
            # --- 5. (可选) 检查梯度 ---
            # for name, param in self.model.named_parameters():
            #     if param.grad is not None:
            #         if check_tensor(param.grad, f"Gradient/{name}", epoch, step):
            #             logger.warning(f"Epoch {epoch} Step {step}: Bad gradient for {name}. Consider clipping or zeroing.")
            #             # 例如：param.grad.data.clamp_(-1, 1) # 值裁剪
            #             # 或在 optimizer.step() 前进行范数裁剪：
            #             # torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
-        # 反向传播和优化
+            # --- (推荐) 添加梯度裁剪 ---
-        loss.backward()
+            # 防止梯度爆炸，这可能是导致 inf/nan 的原因之一
-        self.optimizer.step()
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0) # 范数裁剪
-        total_loss += loss.item()
+            self.optimizer.step()
-        # 记录训练进度
+        except Exception as backward_e:
            logger.error(f"Epoch {epoch} Step {step}: Error during backward pass or optimizer step: {backward_e}", exc_info=True)
            # 如果你想看是哪个操作导致的，可以启用 anomaly detection
            # torch.autograd.set_detect_anomaly(True) # 放在训练开始前
            return float('inf') # 如果反向传播或优化出错，停止这个epoch
        # --- 记录和累加损失 ---
        current_loss = loss.item()
        if not np.isfinite(current_loss): # 再次确认损失是有效的数值
             logger.error(f"Epoch {epoch} Step {step}: Loss item is not finite ({current_loss}).")
             return float('inf')
        total_loss += current_loss
        # 记录训练进度 (只记录有效的损失)
        logger.info(f'Train Epoch: {epoch:4d}]\t'
-                    f'Loss: {loss.item():.6f}')
+                    f'Loss: {current_loss:.6f}')
        if loss_details: logger.info(f"Loss Details: {loss_details}")
        # (如果你的训练分批次，这里应该继续循环下一批次)
        # step += 1
-        return total_loss 
+        return total_loss # 对于单批次训练，直接返回当前损失
    def validate(self, epoch: int) -> float:
        self.model.eval()
@ -259,10 +325,10 @@ class Trainer:
        # 训练完成
        total_time = time.time() - start_time
        self._tracing_model()
        logger.info(f'Training completed in {total_time//3600:.0f}h {(total_time%3600)//60:.0f}m {total_time%60:.2f}s')
        logger.info(f'Best validation loss: {best_val_loss:.6f}')
        self._tracing_model()
        #self.test_load()
    def test_load(self):