diff --git a/python/dgl/graphbolt/__init__.py b/python/dgl/graphbolt/__init__.py
index 74d48573e82e..886761b38938 100644
--- a/python/dgl/graphbolt/__init__.py
+++ b/python/dgl/graphbolt/__init__.py
@@ -54,6 +54,7 @@ def load_graphbolt():
 from .external_utils import add_reverse_edges, exclude_seed_edges
 from .internal import (
     compact_csc_format,
+    numpy_save_aligned,
     unique_and_compact,
     unique_and_compact_csc_formats,
 )
diff --git a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py
index a03437e19d5e..79a851a488c1 100644
--- a/python/dgl/graphbolt/impl/torch_based_feature_store.py
+++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py
@@ -49,7 +49,8 @@ class TorchBasedFeature(Feature):
     >>> feature.size()
     torch.Size([5])
 
-    2. The feature is on disk.
+    2. The feature is on disk. Note that you can use gb.numpy_save_aligned as a
+    replacement for np.save to potentially get increased performance.
 
     >>> import numpy as np
     >>> arr = np.array([[1, 2], [3, 4]])
@@ -237,7 +238,9 @@ def __repr__(self) -> str:
 class DiskBasedFeature(Feature):
     r"""A wrapper of disk based feature.
 
-    Initialize a disk based feature fetcher by a numpy file.
+    Initialize a disk based feature fetcher by a numpy file. Note that you can
+    use gb.numpy_save_aligned as a replacement for np.save to potentially get
+    increased performance.
 
     Parameters
     ----------
@@ -250,7 +253,7 @@ class DiskBasedFeature(Feature):
     >>> from dgl import graphbolt as gb
     >>> torch_feat = torch.arange(10).reshape(2, -1)
     >>> pth = "path/to/feat.npy"
-    >>> np.save(pth,torch_feat)
+    >>> np.save(pth, torch_feat)
     >>> feature = gb.DiskBasedFeature(pth)
     >>> feature.read(torch.tensor([0]))
     tensor([[0, 1, 2, 3, 4]])
@@ -356,7 +359,8 @@ class TorchBasedFeatureStore(BasicFeatureStore):
     For a feature store, its format must be either "pt" or "npy" for Pytorch or
     Numpy formats. If the format is "pt", the feature store must be loaded in
     memory. If the format is "npy", the feature store can be loaded in memory or
-    on disk.
+    on disk. Note that you can use gb.numpy_save_aligned as a replacement for
+    np.save to potentially get increased performance.
 
     Parameters
     ----------
diff --git a/python/dgl/graphbolt/internal/utils.py b/python/dgl/graphbolt/internal/utils.py
index c8a9dcc7d8be..614e2ac5561c 100644
--- a/python/dgl/graphbolt/internal/utils.py
+++ b/python/dgl/graphbolt/internal/utils.py
@@ -12,6 +12,21 @@
 from numpy.lib.format import read_array_header_1_0, read_array_header_2_0
 
 
+def numpy_save_aligned(*args, **kwargs):
+    """A wrapper for numpy.save(), ensures the array is stored 4KiB aligned."""
+    # https://github.com/numpy/numpy/blob/2093a6d5b933f812d15a3de0eafeeb23c61f948a/numpy/lib/format.py#L179
+    if not hasattr(np.lib.format, "ARRAY_ALIGN"):
+        return np.save(*args, **kwargs)  # Nothing to override; plain save.
+    default_alignment = np.lib.format.ARRAY_ALIGN
+    # 4K is the maximum alignment allowed by the numpy code linked above and
+    # matches most filesystems' block size, so in practice files don't grow.
+    np.lib.format.ARRAY_ALIGN = 4096
+    try:
+        np.save(*args, **kwargs)
+    finally:
+        np.lib.format.ARRAY_ALIGN = default_alignment  # restore even on error
+
+
 def _read_torch_data(path):
     return torch.load(path)
 
@@ -54,7 +69,7 @@ def save_data(data, path, fmt):
                 "so it will be copied to contiguous memory."
             )
             data = np.ascontiguousarray(data)
-        np.save(path, data)
+        numpy_save_aligned(path, data)
     elif fmt == "torch":
         if not data.is_contiguous():
             Warning(
diff --git a/tests/python/pytorch/graphbolt/internal/test_utils.py b/tests/python/pytorch/graphbolt/internal/test_utils.py
index c1274f1a9adf..831fb9470e3a 100644
--- a/tests/python/pytorch/graphbolt/internal/test_utils.py
+++ b/tests/python/pytorch/graphbolt/internal/test_utils.py
@@ -2,6 +2,9 @@
 import os
 import re
 import tempfile
+from functools import partial
+
+import dgl.graphbolt as gb
 
 import dgl.graphbolt.internal as internal
 import numpy as np
@@ -266,3 +269,18 @@ def test_check_dataset_change():
             file.write("test contents of directory changed")
 
         assert internal.check_dataset_change(test_dir, "preprocessed")
+
+
+def test_numpy_save_aligned():
+    """Aligned and plain saves hold equal data; aligned file is 2 x 4KiB."""
+    exact_match = partial(torch.testing.assert_close, rtol=0, atol=0)
+    payload = torch.randn(1024, dtype=torch.float32).numpy()  # 4096 bytes
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        plain_file = os.path.join(tmp_dir, "nonaligned.npy")
+        np.save(plain_file, payload)
+        padded_file = os.path.join(tmp_dir, "aligned.npy")
+        gb.numpy_save_aligned(padded_file, payload)
+
+        exact_match(np.load(padded_file), np.load(plain_file))
+        # Expected size: one 4K block of padded header plus 4K of tensor data.
+        assert os.path.getsize(padded_file) == 2 * 4096