# gen_data_curobo/examples/mesh_dataset.py

#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
import math
import os
import sys
from typing import Optional

import numpy as np
import pyrender
import torch
import torch.nn.functional as F
import trimesh
from torch.utils.data.dataset import Dataset


def fov_and_size_to_intrinsics(fov, img_size, device="cpu"):
    """Build a 3x3 pinhole intrinsics matrix from a field of view and image size.

    Args:
        fov: Field of view in degrees. The same angle is applied to both
            image axes when deriving the focal lengths.
        img_size: ``(height, width)`` of the image in pixels.
        device: Torch device on which the returned tensor is created.

    Returns:
        A float32 tensor ``[[fx, 0, cx], [0, fy, cy], [0, 0, 1]]`` whose
        principal point ``(cx, cy)`` is the image center.
    """
    img_h, img_w = img_size
    half_fov_tan = math.tan(math.radians(fov) / 2)
    fx = img_w / (2 * half_fov_tan)
    fy = img_h / (2 * half_fov_tan)

    # cx runs along the image width (x axis), cy along the height (y axis).
    cx = img_w / 2
    cy = img_h / 2

    intrinsics = torch.tensor(
        [[fx, 0, cx], [0, fy, cy], [0, 0, 1]],
        dtype=torch.float,
        device=device,
    )
    return intrinsics


def lookat_to_cam_pose(eyes, ats, ups=[[0, 0, 1]], device="cpu", mode="opengl"):
    """Convert batched look-at parameters into 4x4 camera pose matrices.

    Args:
        eyes: Camera positions, one 3-vector per row. Non-tensor input is
            converted to a float32 tensor on ``device``.
        ats: Points the cameras look at, same layout as ``eyes``.
        ups: Up direction(s) used to orient the camera; defaults to +z.
        device: Device used only when converting non-tensor inputs.
        mode: ``"opengl"`` (columns: right, up, -view) or ``"opencv"``
            (columns: right, -up, view).

    Returns:
        A float32 tensor of shape ``(batch, 4, 4)``; the last column holds
        ``eyes``. The output buffer is created with ``torch.zeros`` without a
        device argument, so it lives on the default device regardless of
        ``device``.

    Raises:
        ValueError: If ``mode`` is neither ``"opengl"`` nor ``"opencv"``.
    """

    def _to_tensor(value):
        # Tensors are passed through untouched; anything else is converted.
        if isinstance(value, torch.Tensor):
            return value
        return torch.tensor(value, device=device, dtype=torch.float32)

    eyes = _to_tensor(eyes)
    ats = _to_tensor(ats)
    ups = _to_tensor(ups)

    # Orthonormal camera frame built from the viewing direction and the up hint.
    view_dir = F.normalize(ats - eyes, dim=1)
    right_dir = F.normalize(torch.cross(view_dir, ups, dim=1), dim=1)
    up_dir = F.normalize(torch.cross(right_dir, view_dir, dim=1), dim=1)

    pose = torch.zeros((eyes.shape[0], 4, 4))
    if mode == "opengl":
        # OpenGL cameras look down their negative z axis.
        columns = [right_dir, up_dir, -view_dir, eyes]
    elif mode == "opencv":
        # OpenCV cameras look down +z with y pointing down.
        columns = [right_dir, -up_dir, view_dir, eyes]
    else:
        raise ValueError(f"Unknown mode: {mode}")

    pose[:, :3, :] = torch.stack(columns, dim=2)
    pose[:, 3, 3] = 1.0
    return pose.float()


def sample_sphere_points(N, radius, device="cuda"):
    """Sample ``N`` random points on a sphere of the given radius around zero.

    Two angles are drawn uniformly, one in ``[-pi/2, pi/2)`` and one in
    ``[-pi, pi)``, and mapped to Cartesian coordinates. Because the angles
    (not the surface area) are uniform, the points are not uniformly
    distributed over the sphere's surface.

    Args:
        N: Number of points to draw.
        radius: Sphere radius.
        device: Torch device on which the samples are generated.

    Returns:
        A tensor of shape ``(N, 3)`` whose rows all have norm ``radius``.
    """
    # Keep the draw order (half-range angle first) so seeded runs reproduce.
    half_range_angle = (torch.rand(size=(N, 1), device=device) - 0.5) * torch.pi
    full_range_angle = (torch.rand(size=(N, 1), device=device) - 0.5) * torch.pi * 2

    cos_full = torch.cos(full_range_angle)
    unit_points = torch.cat(
        [
            torch.cos(half_range_angle) * cos_full,
            torch.sin(half_range_angle) * cos_full,
            torch.sin(full_range_angle),
        ],
        dim=1,
    )
    return unit_points * radius


def sample_sphere_poses(N, origin, radius, device="cuda"):
    """Sample ``N`` camera poses on a sphere around ``origin``, looking at it.

    Args:
        N: Number of poses to sample.
        origin: 3-vector (tensor, array or sequence) giving the sphere center
            and the point every camera looks at.
        radius: Distance from ``origin`` to every camera position.
        device: Torch device used for sampling the camera positions.

    Returns:
        A tuple ``(poses_gl, poses_cv)`` holding the same camera poses as
        returned by ``lookat_to_cam_pose`` in its ``"opengl"`` and
        ``"opencv"`` modes respectively.
    """
    # Accept tensors/arrays/lists alike and make sure dtype and device match
    # the sampled points so the arithmetic below cannot mix devices.
    origin = torch.as_tensor(origin, dtype=torch.float32, device=device)

    # sample_sphere_points samples around zero; shift the samples so the
    # sphere is centered on ``origin`` and each eye is exactly ``radius``
    # away from the point it looks at.
    eyes = sample_sphere_points(N, radius, device) + origin[None, :]
    ats = origin[None, :].repeat((N, 1))

    poses_gl = lookat_to_cam_pose(eyes, ats, device=device, mode="opengl")
    poses_cv = lookat_to_cam_pose(eyes, ats, device=device, mode="opencv")
    return poses_gl, poses_cv


def compute_origin_and_radius(trimesh_scene):
    """Return the center and radius of the bounding sphere of an object's AABB.

    Args:
        trimesh_scene: Any object exposing ``bounds`` as a ``(low, high)``
            pair of 3-vectors (axis-aligned bounding box corners).

    Returns:
        A tuple ``(center, radius)`` where ``center`` is the midpoint of the
        bounding box and ``radius`` is the distance from that midpoint to a
        box corner.

    Raises:
        ValueError: If ``bounds`` is ``None``.
    """
    bounds = trimesh_scene.bounds
    if bounds is None:
        raise ValueError("Cannot compute origin and radius: the mesh has no bounds.")

    low, high = np.asarray(bounds, dtype=np.float64)
    # Midpoint of the box; the parentheses matter (``high + low / 2`` is not
    # the center).
    center = (high + low) / 2
    # Both corners are equidistant from the midpoint, so one half-diagonal
    # is enough.
    half_extent = high - center
    radius = np.sqrt((half_extent**2).sum())
    return center, radius


def render_batch(trimesh_mesh, camera_poses, fov, image_size):
    """Render color and depth images of a mesh from a batch of camera poses.

    Args:
        trimesh_mesh: Mesh to render; converted with
            ``pyrender.Mesh.from_trimesh``.
        camera_poses: Tensor of 4x4 camera poses, iterated along its first
            dimension. It is passed to ``pyrender.PerspectiveCamera`` via
            ``scene.set_pose``, so OpenGL-convention poses are presumably
            expected -- confirm against the caller.
        fov: Vertical field of view handed to pyrender as ``yfov``
            (pyrender expects radians).
        image_size: Side length in pixels of the square output images.

    Returns:
        A tuple ``(colors, depths)`` of numpy arrays stacking one color image
        and one depth image per camera pose.
    """
    camera_poses = camera_poses.detach().cpu().numpy()

    scene = pyrender.Scene()
    scene.add(pyrender.Mesh.from_trimesh(trimesh_mesh))

    camera = pyrender.Node(
        camera=pyrender.PerspectiveCamera(yfov=fov, aspectRatio=1.0),
        matrix=np.eye(4),
    )
    scene.add_node(camera)

    light = pyrender.Node(
        light=pyrender.SpotLight(
            color=np.ones(3),
            intensity=1.0,
            innerConeAngle=np.pi / 16.0,
            outerConeAngle=np.pi / 2.0,
        ),
        matrix=np.eye(4),
    )
    scene.add_node(light)

    renderer = pyrender.OffscreenRenderer(image_size, image_size)
    colors = []
    depths = []
    try:
        for camera_pose in camera_poses:
            # The light travels with the camera so the visible side of the
            # mesh is always lit.
            scene.set_pose(camera, camera_pose)
            scene.set_pose(light, camera_pose)
            color, depth = renderer.render(scene)
            colors.append(color)
            depths.append(depth)
    finally:
        # Release the offscreen rendering resources even if rendering fails;
        # otherwise every call leaks a renderer.
        renderer.delete()

    return np.asarray(colors), np.asarray(depths)


"""
MeshDataset takes a path to a mesh (or an already-loaded trimesh mesh) as input and uses
PyRender to render color and depth images of the mesh from camera poses sampled on a sphere
around the scene.
"""


class MeshDataset(Dataset):
    """Dataset of rendered RGBA/depth frames of a mesh with their camera poses.

    On construction the mesh is loaded (unless one is passed in), camera
    poses are sampled on a sphere whose radius is twice the mesh's bounding
    radius, and every frame is rendered up front. Items are dictionaries
    with the keys ``"rgba"``, ``"depth"``, ``"pose"`` and ``"intrinsics"``.
    """

    def __init__(
        self,
        mesh_file: Optional[str] = None,
        n_frames: int = 10,
        image_size: int = 256,
        save_data_dir: Optional[str] = None,
        trimesh_mesh: Optional["trimesh.Trimesh"] = None,
        fov_deg: int = 60,
        device: str = "cuda",
    ) -> None:
        """Load the mesh, sample camera poses and render all frames.

        Args:
            mesh_file: Path of the mesh to load; ignored when
                ``trimesh_mesh`` is given.
            n_frames: Number of camera poses / frames to render.
            image_size: Side length in pixels of the square rendered images.
            save_data_dir: If given, the rendered data is also written there
                via ``save_as_sun3d_dataset``.
            trimesh_mesh: Already-loaded mesh to use instead of ``mesh_file``.
            fov_deg: Camera field of view in degrees.
            device: Torch device used for pose sampling and the intrinsics
                tensor. Defaults to ``"cuda"``, the previously hard-coded
                value.

        Raises:
            ValueError: If neither ``mesh_file`` nor ``trimesh_mesh`` is given.
        """
        super().__init__()
        if trimesh_mesh is None and mesh_file is None:
            raise ValueError("Either mesh_file or trimesh_mesh must be provided.")

        self.mesh_file = mesh_file
        self.n_frames = n_frames
        if trimesh_mesh is None:
            self.trimesh_mesh = trimesh.load(self.mesh_file)
        else:
            self.trimesh_mesh = trimesh_mesh
        # The renderer needs integer pixel dimensions.
        self.image_size = int(image_size)
        self.fov_deg = fov_deg
        self.device = device

        origin, radius = compute_origin_and_radius(self.trimesh_mesh)
        # Place the cameras at twice the bounding radius from the mesh.
        sphere_radius = radius * 2.0
        self.camera_poses_gl, self.camera_poses_cv = sample_sphere_poses(
            n_frames, origin, sphere_radius, device
        )
        # The renderer consumes the OpenGL-convention poses; the
        # OpenCV-convention ones are what __getitem__ hands out.
        self.colors, self.depths = render_batch(
            self.trimesh_mesh,
            self.camera_poses_gl,
            fov=math.radians(self.fov_deg),
            image_size=self.image_size,
        )
        self.intrinsics = fov_and_size_to_intrinsics(
            self.fov_deg, (self.image_size, self.image_size), device=device
        )

        if save_data_dir is not None:
            self.save_as_sun3d_dataset(save_data_dir)

    @staticmethod
    def _matrix_to_text(rows) -> str:
        """Format a nested list as space-separated values, one row per line."""
        return "\n".join(" ".join(str(value) for value in row) for row in rows)

    def save_as_sun3d_dataset(self, output_dir):
        """Write all frames to ``output_dir`` in a SUN3D-style folder layout.

        Creates ``camera-intrinsics.txt`` plus, under ``seq-01``, one
        ``frame-XXXXXX.color.png``, ``.depth.png`` and ``.pose.txt`` per
        frame. Matrix files contain one row per line without any leading
        indentation.

        Args:
            output_dir: Destination directory; created if missing.
        """
        import imageio
        from transforms3d.quaternions import quat2mat

        os.makedirs(output_dir, exist_ok=True)
        K = self.intrinsics.detach().cpu().numpy().tolist()
        with open(f"{output_dir}/camera-intrinsics.txt", "w") as fp:
            fp.write(self._matrix_to_text(K))

        seqdir = f"{output_dir}/seq-01"
        os.makedirs(seqdir, exist_ok=True)

        # Fixed frame change between the SUN3D and nvblox conventions; it is
        # identical for every frame, so build and invert it once.
        # transforms3d quaternions are (w, x, y, z): this one is a 90 degree
        # rotation about the x axis.
        eigen_quat = [0.707106769, 0.707106769, 0, 0]
        sun3d_to_nvblox_T = torch.eye(4)
        sun3d_to_nvblox_T[:3, :3] = torch.tensor(quat2mat(eigen_quat))
        nvblox_to_sun3d_T = torch.linalg.inv(sun3d_to_nvblox_T)

        max_depth_value = np.iinfo(np.uint16).max
        for i in range(len(self)):
            data = self[i]
            rgb = data["rgba"][:3, :, :].detach().cpu().permute(1, 2, 0).numpy()

            # Depth is stored scaled by 1000 as 16-bit integers (presumably
            # metres -> millimetres; confirm the mesh units). Clip first so
            # out-of-range values saturate instead of wrapping around.
            depth = (data["depth"] * 1000).detach().cpu().numpy()
            depth = np.clip(depth, 0, max_depth_value).astype(np.uint16)

            # Bring the pose to CPU float32 so it can be multiplied with the
            # CPU transform regardless of where the poses live.
            nvblox_pose = data["pose"].detach().cpu().float()
            sun3d_pose = nvblox_to_sun3d_T @ nvblox_pose
            P = sun3d_pose.numpy().tolist()

            framename = f"frame-{str(i).zfill(6)}"
            imageio.imwrite(f"{seqdir}/{framename}.color.png", rgb)
            imageio.imwrite(f"{seqdir}/{framename}.depth.png", depth)
            with open(f"{seqdir}/{framename}.pose.txt", "w") as fp:
                fp.write(self._matrix_to_text(P))

    def __len__(self):
        """Return the number of rendered frames."""
        return self.n_frames

    def __getitem__(self, index):
        """Return frame ``index`` as a dictionary of tensors.

        Returns:
            A dict with ``"rgba"`` (channels-first image whose alpha channel
            is 255 where depth is positive and 0 elsewhere), ``"depth"``
            (float32 depth image), ``"pose"`` (the OpenCV-convention camera
            pose) and ``"intrinsics"`` (the shared intrinsics matrix).
        """
        rgb_np = self.colors[index]
        depth_np = self.depths[index]
        # Pixels with no rendered geometry have zero depth; mark them
        # transparent.
        a_np = (depth_np > 0).astype(np.uint8) * 255
        rgba_np = np.concatenate([rgb_np, a_np[:, :, None]], axis=2)
        pose = self.camera_poses_cv[index]
        intrinsics = self.intrinsics

        depth_np = depth_np.astype(np.float32)
        rgba = torch.from_numpy(rgba_np).permute((2, 0, 1))
        depth = torch.from_numpy(depth_np).float()

        return {"rgba": rgba, "depth": depth, "pose": pose, "intrinsics": intrinsics}