Merge branch 'devel' into weights_only
njzjz committed Sep 22, 2024
2 parents 1c0f994 + 6010c73 commit 35b27cb
Showing 26 changed files with 555 additions and 279 deletions.
2 changes: 2 additions & 0 deletions backend/read_env.py
@@ -60,6 +60,8 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str, str]:
cmake_minimum_required_version = "3.21"
cmake_args.append("-DUSE_ROCM_TOOLKIT:BOOL=TRUE")
rocm_root = os.environ.get("ROCM_ROOT")
if not rocm_root:
rocm_root = os.environ.get("ROCM_PATH")
if rocm_root:
cmake_args.append(f"-DCMAKE_HIP_COMPILER_ROCM_ROOT:STRING={rocm_root}")
hipcc_flags = os.environ.get("HIP_HIPCC_FLAGS")
5 changes: 5 additions & 0 deletions deepmd/pt/__init__.py
@@ -4,6 +4,11 @@
from deepmd.pt.cxx_op import (
ENABLE_CUSTOMIZED_OP,
)
from deepmd.utils.entry_point import (
load_entry_point,
)

load_entry_point("deepmd.pt")

__all__ = [
"ENABLE_CUSTOMIZED_OP",
227 changes: 0 additions & 227 deletions deepmd/pt/infer/deep_eval.py
@@ -604,230 +604,3 @@ def eval_typeebd(self) -> np.ndarray:
def get_model_def_script(self) -> str:
"""Get model defination script."""
return self.model_def_script


# For tests only
def eval_model(
model,
coords: Union[np.ndarray, torch.Tensor],
cells: Optional[Union[np.ndarray, torch.Tensor]],
atom_types: Union[np.ndarray, torch.Tensor, List[int]],
spins: Optional[Union[np.ndarray, torch.Tensor]] = None,
atomic: bool = False,
infer_batch_size: int = 2,
denoise: bool = False,
):
model = model.to(DEVICE)
energy_out = []
atomic_energy_out = []
force_out = []
force_mag_out = []
virial_out = []
atomic_virial_out = []
updated_coord_out = []
logits_out = []
err_msg = (
f"All inputs should be the same format, "
f"but found {type(coords)}, {type(cells)}, {type(atom_types)} instead! "
)
return_tensor = True
if isinstance(coords, torch.Tensor):
if cells is not None:
assert isinstance(cells, torch.Tensor), err_msg
if spins is not None:
assert isinstance(spins, torch.Tensor), err_msg
assert isinstance(atom_types, torch.Tensor) or isinstance(atom_types, list)
atom_types = torch.tensor(atom_types, dtype=torch.long, device=DEVICE)
elif isinstance(coords, np.ndarray):
if cells is not None:
assert isinstance(cells, np.ndarray), err_msg
if spins is not None:
assert isinstance(spins, np.ndarray), err_msg
assert isinstance(atom_types, np.ndarray) or isinstance(atom_types, list)
atom_types = np.array(atom_types, dtype=np.int32)
return_tensor = False

nframes = coords.shape[0]
if len(atom_types.shape) == 1:
natoms = len(atom_types)
if isinstance(atom_types, torch.Tensor):
atom_types = torch.tile(atom_types.unsqueeze(0), [nframes, 1]).reshape(
nframes, -1
)
else:
atom_types = np.tile(atom_types, nframes).reshape(nframes, -1)
else:
natoms = len(atom_types[0])

coord_input = torch.tensor(
coords.reshape([-1, natoms, 3]), dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
)
spin_input = None
if spins is not None:
spin_input = torch.tensor(
spins.reshape([-1, natoms, 3]),
dtype=GLOBAL_PT_FLOAT_PRECISION,
device=DEVICE,
)
has_spin = getattr(model, "has_spin", False)
if callable(has_spin):
has_spin = has_spin()
type_input = torch.tensor(atom_types, dtype=torch.long, device=DEVICE)
box_input = None
if cells is None:
pbc = False
else:
pbc = True
box_input = torch.tensor(
cells.reshape([-1, 3, 3]), dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
)
num_iter = int((nframes + infer_batch_size - 1) / infer_batch_size)

for ii in range(num_iter):
batch_coord = coord_input[ii * infer_batch_size : (ii + 1) * infer_batch_size]
batch_atype = type_input[ii * infer_batch_size : (ii + 1) * infer_batch_size]
batch_box = None
batch_spin = None
if spin_input is not None:
batch_spin = spin_input[ii * infer_batch_size : (ii + 1) * infer_batch_size]
if pbc:
batch_box = box_input[ii * infer_batch_size : (ii + 1) * infer_batch_size]
input_dict = {
"coord": batch_coord,
"atype": batch_atype,
"box": batch_box,
"do_atomic_virial": atomic,
}
if has_spin:
input_dict["spin"] = batch_spin
batch_output = model(**input_dict)
if isinstance(batch_output, tuple):
batch_output = batch_output[0]
if not return_tensor:
if "energy" in batch_output:
energy_out.append(batch_output["energy"].detach().cpu().numpy())
if "atom_energy" in batch_output:
atomic_energy_out.append(
batch_output["atom_energy"].detach().cpu().numpy()
)
if "force" in batch_output:
force_out.append(batch_output["force"].detach().cpu().numpy())
if "force_mag" in batch_output:
force_mag_out.append(batch_output["force_mag"].detach().cpu().numpy())
if "virial" in batch_output:
virial_out.append(batch_output["virial"].detach().cpu().numpy())
if "atom_virial" in batch_output:
atomic_virial_out.append(
batch_output["atom_virial"].detach().cpu().numpy()
)
if "updated_coord" in batch_output:
updated_coord_out.append(
batch_output["updated_coord"].detach().cpu().numpy()
)
if "logits" in batch_output:
logits_out.append(batch_output["logits"].detach().cpu().numpy())
else:
if "energy" in batch_output:
energy_out.append(batch_output["energy"])
if "atom_energy" in batch_output:
atomic_energy_out.append(batch_output["atom_energy"])
if "force" in batch_output:
force_out.append(batch_output["force"])
if "force_mag" in batch_output:
force_mag_out.append(batch_output["force_mag"])
if "virial" in batch_output:
virial_out.append(batch_output["virial"])
if "atom_virial" in batch_output:
atomic_virial_out.append(batch_output["atom_virial"])
if "updated_coord" in batch_output:
updated_coord_out.append(batch_output["updated_coord"])
if "logits" in batch_output:
logits_out.append(batch_output["logits"])
if not return_tensor:
energy_out = (
np.concatenate(energy_out) if energy_out else np.zeros([nframes, 1]) # pylint: disable=no-explicit-dtype
)
atomic_energy_out = (
np.concatenate(atomic_energy_out)
if atomic_energy_out
else np.zeros([nframes, natoms, 1]) # pylint: disable=no-explicit-dtype
)
force_out = (
np.concatenate(force_out) if force_out else np.zeros([nframes, natoms, 3]) # pylint: disable=no-explicit-dtype
)
force_mag_out = (
np.concatenate(force_mag_out)
if force_mag_out
else np.zeros([nframes, natoms, 3]) # pylint: disable=no-explicit-dtype
)
virial_out = (
np.concatenate(virial_out) if virial_out else np.zeros([nframes, 3, 3]) # pylint: disable=no-explicit-dtype
)
atomic_virial_out = (
np.concatenate(atomic_virial_out)
if atomic_virial_out
else np.zeros([nframes, natoms, 3, 3]) # pylint: disable=no-explicit-dtype
)
updated_coord_out = (
np.concatenate(updated_coord_out) if updated_coord_out else None
)
logits_out = np.concatenate(logits_out) if logits_out else None
else:
energy_out = (
torch.cat(energy_out)
if energy_out
else torch.zeros(
[nframes, 1], dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
)
)
atomic_energy_out = (
torch.cat(atomic_energy_out)
if atomic_energy_out
else torch.zeros(
[nframes, natoms, 1], dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
)
)
force_out = (
torch.cat(force_out)
if force_out
else torch.zeros(
[nframes, natoms, 3], dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
)
)
force_mag_out = (
torch.cat(force_mag_out)
if force_mag_out
else torch.zeros(
[nframes, natoms, 3], dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
)
)
virial_out = (
torch.cat(virial_out)
if virial_out
else torch.zeros(
[nframes, 3, 3], dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
)
)
atomic_virial_out = (
torch.cat(atomic_virial_out)
if atomic_virial_out
else torch.zeros(
[nframes, natoms, 3, 3], dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
)
)
updated_coord_out = torch.cat(updated_coord_out) if updated_coord_out else None
logits_out = torch.cat(logits_out) if logits_out else None
if denoise:
return updated_coord_out, logits_out
else:
results_dict = {
"energy": energy_out,
"force": force_out,
"virial": virial_out,
}
if has_spin:
results_dict["force_mag"] = force_mag_out
if atomic:
results_dict["atom_energy"] = atomic_energy_out
results_dict["atom_virial"] = atomic_virial_out
return results_dict
5 changes: 0 additions & 5 deletions deepmd/pt/model/__init__.py
@@ -1,6 +1 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
from deepmd.utils.entry_point import (
load_entry_point,
)

load_entry_point("deepmd.pt")
7 changes: 5 additions & 2 deletions deepmd/pt/train/training.py
@@ -1032,10 +1032,13 @@ def save_model(self, save_path, lr=0.0, step=0):
if dist.is_available() and dist.is_initialized()
else self.wrapper
)
module.train_infos["lr"] = lr
module.train_infos["lr"] = float(lr)
module.train_infos["step"] = step
optim_state_dict = deepcopy(self.optimizer.state_dict())
for item in optim_state_dict["param_groups"]:
item["lr"] = float(item["lr"])
torch.save(
{"model": module.state_dict(), "optimizer": self.optimizer.state_dict()},
{"model": module.state_dict(), "optimizer": optim_state_dict},
save_path,
)
checkpoint_dir = save_path.parent
7 changes: 6 additions & 1 deletion deepmd/utils/path.py
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
import itertools
import os
from abc import (
ABC,
@@ -373,7 +374,11 @@ def glob(self, pattern: str) -> List["DPPath"]:
list of paths
"""
# get paths that start with the current path first, which is faster
subpaths = [ii for ii in self._keys if ii.startswith(self._name)]
subpaths = [
ii
for ii in itertools.chain(self._keys, self._new_keys)
if ii.startswith(self._name)
]
return [
type(self)(f"{self.root_path}#{pp}", mode=self.mode)
for pp in globfilter(subpaths, self._connect_path(pattern))
3 changes: 2 additions & 1 deletion doc/install/install-from-source.md
@@ -155,7 +155,8 @@ The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is

**Type**: Path; **Default**: Detected automatically

The path to the ROCM toolkit directory.
The path to the ROCM toolkit directory. If `ROCM_ROOT` is not set, it will look for `ROCM_PATH`; if `ROCM_PATH` is also not set, it will be detected using `hipconfig --rocmpath`.

:::
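As an illustration of the lookup order described above, a hypothetical source build against a ROCm installation might look like the sketch below (the install path and the `pip install .` invocation are examples, not the only supported workflow):

```sh
# Point the build at a specific ROCm installation (highest priority).
export ROCM_ROOT=/opt/rocm-6.0.2

# If ROCM_ROOT is unset, ROCM_PATH is consulted instead:
# export ROCM_PATH=/opt/rocm

# If neither variable is set, the build falls back to `hipconfig --rocmpath`.
pip install .
```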

:::{envvar} DP_ENABLE_TENSORFLOW
2 changes: 1 addition & 1 deletion doc/model/dpa2.md
@@ -6,7 +6,7 @@

The DPA-2 model implementation. See https://arxiv.org/abs/2312.15492 for more details.

Training example: `examples/water/dpa2/input_torch.json`.
Training example: `examples/water/dpa2/input_torch_medium.json`; see the [README](../../examples/water/dpa2/README.md) for inputs at different levels of model complexity.

## Data format

15 changes: 15 additions & 0 deletions examples/water/dpa2/README.md
@@ -0,0 +1,15 @@
## Inputs for DPA-2 model

This directory contains the input files for training the DPA-2 model (currently supported by the PyTorch backend only). Depending on your precision/efficiency requirements, we provide three levels of model complexity:

- `input_torch_small.json`: Our smallest DPA-2 model, optimized for speed.
- `input_torch_medium.json` (Recommended): Our well-performing DPA-2 model, balancing efficiency and precision. This is a good starting point for most users.
- `input_torch_large.json`: Our most complex model with the highest precision, suitable for very intricate data structures.

For detailed differences in their configurations, please refer to the table below:

| Input | Repformer layers | Three-body embedding in Repinit | Pair-wise attention in Repformer | Tuned sub-structures in [#4089](https://github.com/deepmodeling/deepmd-kit/pull/4089) | Description |
| ------------------------- | ---------------- | ------------------------------- | -------------------------------- | ------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------- |
| `input_torch_small.json` | 3 |||| Smallest DPA-2 model, optimized for speed. |
| `input_torch_medium.json` | 6 |||| Recommended well-performing DPA-2 model, balancing efficiency and precision. |
| `input_torch_large.json` | 12 |||| Most complex model with the highest precision. |
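
A minimal training invocation with one of these inputs might look like the following sketch (this assumes the `dp` command-line entry point installed with deepmd-kit and its `--pt` flag for the PyTorch backend; adjust paths to your setup):

```sh
cd examples/water/dpa2
# Train the recommended medium-sized DPA-2 model with the PyTorch backend.
dp --pt train input_torch_medium.json
```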
@@ -9,16 +9,20 @@
"type": "dpa2",
"repinit": {
"tebd_dim": 8,
"rcut": 9.0,
"rcut_smth": 8.0,
"rcut": 6.0,
"rcut_smth": 0.5,
"nsel": 120,
"neuron": [
25,
50,
100
],
"axis_neuron": 12,
"activation_function": "tanh"
"activation_function": "tanh",
"three_body_sel": 40,
"three_body_rcut": 4.0,
"three_body_rcut_smth": 3.5,
"use_three_body": true
},
"repformer": {
"rcut": 4.0,
@@ -36,10 +40,16 @@
"update_g1_has_conv": true,
"update_g1_has_grrg": true,
"update_g1_has_drrd": true,
"update_g1_has_attn": true,
"update_g2_has_g1g1": true,
"update_g1_has_attn": false,
"update_g2_has_g1g1": false,
"update_g2_has_attn": true,
"attn2_has_gate": true
"update_style": "res_residual",
"update_residual": 0.01,
"update_residual_init": "norm",
"attn2_has_gate": true,
"use_sqrt_nnei": true,
"g1_out_conv": true,
"g1_out_mlp": true
},
"add_tebd_to_repinit_out": false
},
@@ -58,7 +68,7 @@
"learning_rate": {
"type": "exp",
"decay_steps": 5000,
"start_lr": 0.0002,
"start_lr": 0.001,
"stop_lr": 3.51e-08,
"_comment": "that's all"
},