From c5e10fd1721bd1799878fea35fbc87de0767de43 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Sat, 17 Aug 2024 12:05:09 +0000 Subject: [PATCH 01/10] add small igb-homo datasets --- examples/graphbolt/download.py | 577 ++++++++++++++++++++++ examples/graphbolt/node_classification.py | 13 +- 2 files changed, 589 insertions(+), 1 deletion(-) create mode 100644 examples/graphbolt/download.py diff --git a/examples/graphbolt/download.py b/examples/graphbolt/download.py new file mode 100644 index 000000000000..35da6fec58a0 --- /dev/null +++ b/examples/graphbolt/download.py @@ -0,0 +1,577 @@ +import argparse, hashlib, os, shutil, tarfile, yaml +import subprocess +import urllib.request as ur + +import dgl.graphbolt as gb +import numpy as np +from tqdm import tqdm + +GBFACTOR = 1 << 30 + + +def build_yaml_helper(path, dataset_size, in_memory=True): + """The stirng to build the yaml file. (Still need modification)""" + + data = { + "dataset_name": os.path.basename(path), + "feature_data": [ + { + "domain": "node", + "format": "numpy", + "in_memory": in_memory, + "name": "feat", + "path": "data/paper_feat.npy", + } + ], + "graph": { + "edges": [ + {"format": "numpy", "path": "edges/paper__cites__paper.npy"} + ], + "nodes": [{"num": num_nodes[dataset_size]["paper"]}], + }, + "tasks": [ + { + "name": "node_classification", + "num_classes": 19, + "test_set": [ + { + "data": [ + { + "format": "numpy", + "in_memory": in_memory, + "name": "seeds", + "path": "set/test_indices_19.npy", + }, + { + "format": "numpy", + "in_memory": in_memory, + "name": "labels", + "path": "set/test_labels_19.npy", + }, + ] + } + ], + "train_set": [ + { + "data": [ + { + "format": "numpy", + "in_memory": in_memory, + "name": "seeds", + "path": "set/train_indices_19.npy", + }, + { + "format": "numpy", + "in_memory": in_memory, + "name": "labels", + "path": "set/train_labels_19.npy", + }, + ] + } + ], + "validation_set": [ + { + "data": [ + { + "format": "numpy", + "in_memory": in_memory, + "name": "seeds", + "path": "set/validation_indices_19.npy", + }, + { + "format": "numpy", + "in_memory": in_memory, + "name": "labels", + "path": "set/validation_labels_19.npy", + }, + ] + } + ], + }, + { + "name": "node_classification_2K", + "num_classes": 2983, + "test_set": [ + { + "data": [ + { + "format": "numpy", + "in_memory": in_memory, + "name": "seeds", + "path": "set/test_indices_2983.npy", + }, + { + "format": "numpy", + "in_memory": in_memory, + "name": "labels", + "path": "set/test_labels_2983.npy", + }, + ] + } + ], + "train_set": [ + { + "data": [ + { + "format": "numpy", + "in_memory": in_memory, + "name": "seeds", + "path": "set/train_indices_2983.npy", + }, + { + "format": "numpy", + "in_memory": in_memory, + "name": "labels", + "path": "set/train_labels_2983.npy", + }, + ] + } + ], + "validation_set": [ + { + "data": [ + { + "format": "numpy", + "in_memory": in_memory, + "name": "seeds", + "path": "set/validation_indices_2983.npy", + }, + { + "format": "numpy", + "in_memory": in_memory, + "name": "labels", + "path": "set/validation_labels_2983.npy", + }, + ] + } + ], + }, + ], + } + + return data + + +def build_yaml(original_path, current_path, dataset_size): + """This build the yaml file differently based on the dataset size. + The two large datasets are put in disk while the other three smaller versions are in memory. 
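+    The resulting metadata.yaml is written under current_path and follows the
+    layout expected by dgl.graphbolt.OnDiskDataset (dataset_name, feature_data,
+    graph and tasks sections).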
+ """ + if "large" == dataset_size or "full" == dataset_size: + data = build_yaml_helper( + path=original_path, dataset_size=dataset_size, in_memory=False + ) + else: + data = build_yaml_helper(path=original_path, dataset_size=dataset_size) + with open(f"{current_path}/metadata.yaml", "w") as file: + yaml.dump(data=data, stream=file, default_flow_style=False) + + +dataset_urls = { + "homogeneous": { + "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_tiny.tar.gz", + "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_small.tar.gz", + "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_medium.tar.gz", + }, + "heterogeneous": { + "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_tiny.tar.gz", + "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_small.tar.gz", + "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_medium.tar.gz", + }, +} + + +md5checksums = { + "homogeneous": { + "tiny": "34856534da55419b316d620e2d5b21be", + "small": "6781c699723529902ace0a95cafe6fe4", + "medium": "4640df4ceee46851fd18c0a44ddcc622", + }, + "heterogeneous": { + "tiny": "83fbc1091497ff92cf20afe82fae0ade", + "small": "2f42077be60a074aec24f7c60089e1bd", + "medium": "7f0df4296eca36553ff3a6a63abbd347", + }, +} + + +def decide_download(url): + """An interactive command line to confirm download.""" + d = ur.urlopen(url) + size = int(d.info()["Content-Length"]) / GBFACTOR + ### confirm if larger than 1GB + if size > 1: + return ( + input( + "This will download %.2fGB. Will you proceed? (y/N) " % (size) + ).lower() + == "y" + ) + else: + return True + + +def check_md5sum(dataset_type, dataset_size, filename): + """This is for checking the data correctness of the downloaded datasets.""" + original_md5 = md5checksums[dataset_type][dataset_size] + + with open(filename, "rb") as file_to_check: + data = file_to_check.read() + md5_returned = hashlib.md5(data).hexdigest() + + if original_md5 == md5_returned: + print(" md5sum verified.") + return + else: + os.remove(filename) + raise Exception(" md5sum verification failed!.") + + +def download_dataset(path, dataset_type, dataset_size): + """This is the script to download all the related datasets.""" + + # For large datasets, use the two shell scripts to download. + if dataset_size in ["large", "full"]: + command = f"./download_{dataset_size}_igbh.sh" + subprocess.run(["bash", command], check=True, text=True) + shutil.move(src=f"igb-{dataset_type}-{dataset_size}", dst=f"{path}") + return path + "/" + "igb-" + dataset_type + "-" + dataset_size + # For the three smaller version, use the url to download. 
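+    # The archive is streamed in 1 MiB chunks with a progress bar, verified
+    # against the published md5sum, extracted, and the extracted directory is
+    # renamed to igb-<type>-<size>. Note that the existence check below joins
+    # `path` and the archive name without a separator, so --path is assumed to
+    # end with a trailing "/" (the default "datasets/" does).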
+ else: + output_directory = path + if not os.path.exists( + output_directory + + "igb_" + + dataset_type + + "_" + + dataset_size + + ".tar.gz" + ): + url = dataset_urls[dataset_type][dataset_size] + if decide_download(url): + data = ur.urlopen(url) + size = int(data.info()["Content-Length"]) + chunk_size = 1024 * 1024 + num_iter = int(size / chunk_size) + 2 + downloaded_size = 0 + filename = ( + path + + "/igb_" + + dataset_type + + "_" + + dataset_size + + ".tar.gz" + ) + with open(filename, "wb") as f: + pbar = tqdm(range(num_iter)) + for i in pbar: + chunk = data.read(chunk_size) + downloaded_size += len(chunk) + pbar.set_description( + "Downloaded {:.2f} GB".format( + float(downloaded_size) / GBFACTOR + ) + ) + f.write(chunk) + print( + "Downloaded" + " igb_" + dataset_type + "_" + dataset_size, + end=" ->", + ) + check_md5sum(dataset_type, dataset_size, filename) + else: # No need to download the tar file again if it is already downloaded. + print( + "The file igb_" + + dataset_type + + "_" + + dataset_size + + ".tar.gz already exists, directly extracting..." + ) + filename = ( + path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" + ) + # Extract the tar file + file = tarfile.open(filename) + file.extractall(output_directory) + file.close() + size = 0 + for path, dirs, files in os.walk(output_directory + "/" + dataset_size): + for f in files: + fp = os.path.join(path, f) + size += os.path.getsize(fp) + print("Final dataset size {:.2f} GB.".format(size / GBFACTOR)) + # os.remove(filename) + os.rename( + output_directory + "/" + dataset_size, + output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size, + ) + return ( + output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size + ) + + +num_nodes = { + "full": { + "paper": 269346174, + }, + "large": { + "paper": 100000000, + }, + "medium": { + "paper": 10000000, + }, + "small": { + "paper": 1000000, + }, + "tiny": { + "paper": 100000, + }, +} + +num_edges = { + "full": { + "paper__cites__paper": 3996442004, + }, + "large": { + "paper__cites__paper": 1223571364, + }, + "medium": { + "paper__cites__paper": 120077694, + }, + "small": { + "paper__cites__paper": 12070502, + }, + "tiny": { + "paper__cites__paper": 447416, + }, +} + + +def split_data(label_path, set_dir, dataset_size, class_num): + """This is for splitting the labels into three sets: train, validation, and test sets.""" + # labels = np.memmap(label_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) + labels = np.load(label_path) + + total_samples = len(labels) + train_end = int(0.6 * total_samples) + validation_end = int(0.8 * total_samples) + + indices = np.arange(total_samples) + train_indices = indices[:train_end] + validation_indices = indices[train_end:validation_end] + test_indices = indices[validation_end:] + print(indices) + print(train_indices) + print(validation_indices) + print(test_indices) + + train_labels = labels[:train_end] + validation_labels = labels[train_end:validation_end] + test_labels = labels[validation_end:] + print(train_labels, len(train_labels)) + print(validation_labels, len(validation_labels)) + print(test_labels, len(test_labels)) + + gb.numpy_save_aligned( + f"{set_dir}/train_indices_{class_num}.npy", train_indices + ) + gb.numpy_save_aligned( + f"{set_dir}/validation_indices_{class_num}.npy", validation_indices + ) + gb.numpy_save_aligned( + f"{set_dir}/test_indices_{class_num}.npy", test_indices + ) + gb.numpy_save_aligned( + f"{set_dir}/train_labels_{class_num}.npy", train_labels + ) + 
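+    # numpy_save_aligned writes a regular .npy file but pads the header so the
+    # data section starts at an aligned offset, letting GraphBolt map and read
+    # the arrays efficiently.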
gb.numpy_save_aligned( + f"{set_dir}/validation_labels_{class_num}.npy", validation_labels + ) + gb.numpy_save_aligned(f"{set_dir}/test_labels_{class_num}.npy", test_labels) + + +def add_edges(edges, source, dest, dataset_size): + """This is for processing the edges in the graph and convert them to correct shape.""" + for edge in edges: + print(f"\t Processing {edge} edge...") + + old_edge_path = source + "/" + edge + "/" + "edge_index.npy" + new_edge_path = dest + "/" + edge + ".npy" + os.rename(src=old_edge_path, dst=new_edge_path) + + # edge_array = np.memmap(new_edge_path, dtype='int32', mode='r', shape=(num_edges[dataset_size][edge], 2)) + edge_array = np.load(new_edge_path) + new_edge_array = edge_array.transpose() + + assert new_edge_array.shape == (2, num_edges[dataset_size][edge]) + assert np.array_equal(edge_array, new_edge_array.transpose()) + + gb.numpy_save_aligned(new_edge_path, new_edge_array) + + +def process_feat(file_path, node_name, dataset_size): + """This is for processing the node features.""" + # array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) + array = np.load(file_path) + assert array.shape == (num_nodes[dataset_size][node_name], 1024) + gb.numpy_save_aligned(file_path, array) + + # Assert the shape and elements of the array are correct + # new_array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) + new_array = np.load(file_path) + assert new_array.shape == (num_nodes[dataset_size][node_name], 1024) + assert np.array_equal(array, new_array) + + +def process_label(file_path, num_class, dataset_size): + """This is for processing the node labels.""" + if ( + num_class == 2983 and dataset_size == "full" + ): # only this case label number changes + # array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) + array = np.load(file_path) + assert array.shape[0] == 227130858 + else: + # array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) + array = np.load(file_path) + assert array.shape[0] == num_nodes[dataset_size]["paper"] + + gb.numpy_save_aligned(file_path, array) + + # Assert the shape and elements of the array are correct + if num_class == 2983 and dataset_size == "full": + # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) + new_array = np.load(file_path) + assert new_array.shape[0] == 227130858 + assert np.array_equal(array, new_array) + else: + # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) + new_array = np.load(file_path) + assert new_array.shape[0] == num_nodes[dataset_size]["paper"] + assert np.array_equal(array, new_array) + + +def add_nodes(nodes, source, dest, dataset_size): + """This is for processing the nodes in the graph and store them in correct format.""" + for node in nodes: + print(f"\t Processing {node} node feature...") + old_node_path = source + "/" + node + "/" + "node_feat.npy" + new_node_path = dest + "/" + node + "_feat.npy" + os.rename(src=old_node_path, dst=new_node_path) + process_feat( + file_path=new_node_path, node_name=node, dataset_size=dataset_size + ) + # If the node is a paper type, process the labels + if node == "paper": + print(f"\t Processing {node} labels...") + old_label_path_19 = source + "/" + node + "/" + "node_label_19.npy" + new_label_path_19 = dest + "/" + "paper_label_19.npy" + os.rename(src=old_label_path_19, dst=new_label_path_19) + process_label( + 
file_path=new_label_path_19, + num_class=19, + dataset_size=dataset_size, + ) + + old_label_path_2K = source + "/" + node + "/" + "node_label_2K.npy" + new_label_path_2K = dest + "/" + "paper_label_2K.npy" + os.rename(src=old_label_path_2K, dst=new_label_path_2K) + process_label( + file_path=new_label_path_2K, + num_class=2983, + dataset_size=dataset_size, + ) + + return new_label_path_19, new_label_path_2K + + +def process_dataset(path, dataset_size): + print(f"Starting to process the {dataset_size} dataset...") + + # Step 0: Make the directory for processed dataset + processed_dir = path + "-seeds" + os.makedirs(name=processed_dir, exist_ok=True) + original_path = path + "/" + "processed" + + # Step 1: Move Nodes files + print("Processing Node files...") + node_dir = processed_dir + "/" + "data" + os.makedirs(name=node_dir, exist_ok=True) + # These are the one node in this homogeneous citation network + nodes = ["paper"] + label_file_19, label_file_2K = add_nodes( + nodes=nodes, + source=original_path, + dest=node_dir, + dataset_size=dataset_size, + ) + + # Step 2: Create labels + print("Processing train/valid/test files...") + set_dir = processed_dir + "/" + "set" + os.makedirs(name=set_dir, exist_ok=True) + split_data( + label_path=label_file_19, + set_dir=set_dir, + dataset_size=dataset_size, + class_num=19, + ) + split_data( + label_path=label_file_2K, + set_dir=set_dir, + dataset_size=dataset_size, + class_num=2983, + ) + + # Step 3: Move edge files + print("Processing Edge files...") + edge_dir = processed_dir + "/" + "edges" + os.makedirs(name=edge_dir, exist_ok=True) + # These are the one edge in this homogeneous citation network + edges = [ + "paper__cites__paper", + ] + add_edges( + edges=edges, + source=original_path, + dest=edge_dir, + dataset_size=dataset_size, + ) + + # Step 4: Build the yaml file + print("Building yaml file...") + build_yaml( + original_path=path, + current_path=processed_dir, + dataset_size=dataset_size, + ) + + # shutil.rmtree(path) + print(f"Finished processing the {dataset_size} dataset") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + type=str, + default="datasets/", + help="path to store the datasets", + ) + parser.add_argument( + "--type", + type=str, + default="homogeneous", + choices=["homogeneous", "heterogeneous"], + help="dataset type", + ) + parser.add_argument( + "--size", + type=str, + default="tiny", + choices=["tiny", "small", "medium"], + help="size of the datasets", + ) + args = parser.parse_args() + path = download_dataset( + path=args.path, dataset_type=args.type, dataset_size=args.size + ) + process_dataset(path=path, dataset_size=args.size) diff --git a/examples/graphbolt/node_classification.py b/examples/graphbolt/node_classification.py index a4a8be298d2c..51da47e46d32 100644 --- a/examples/graphbolt/node_classification.py +++ b/examples/graphbolt/node_classification.py @@ -37,6 +37,7 @@ │ └───> All nodes set inference & Test set evaluation """ + import argparse import time @@ -309,6 +310,9 @@ def train(args, graph, features, train_set, valid_set, num_classes, model): # in the last layer's computation graph. y = data.labels + if y.dtype != "int64": + y = y.long() + y_hat = model(data.blocks, x) # Compute loss. 
@@ -363,7 +367,14 @@ def parse_args(): "--dataset", type=str, default="ogbn-products", - choices=["ogbn-arxiv", "ogbn-products", "ogbn-papers100M"], + choices=[ + "ogbn-arxiv", + "ogbn-products", + "ogbn-papers100M", + "igb-homogeneous-tiny", + "igb-homogeneous-small", + "igb-homogeneous-medium", + ], help="The dataset we can use for node classification example. Currently" " ogbn-products, ogbn-arxiv, ogbn-papers100M datasets are supported.", ) From 418d06ca88c716a871d0d4680b619c50831b0ff1 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Sat, 17 Aug 2024 11:41:05 -0700 Subject: [PATCH 02/10] Update examples/graphbolt/node_classification.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/node_classification.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/graphbolt/node_classification.py b/examples/graphbolt/node_classification.py index 51da47e46d32..bb871215259d 100644 --- a/examples/graphbolt/node_classification.py +++ b/examples/graphbolt/node_classification.py @@ -376,7 +376,8 @@ def parse_args(): "igb-homogeneous-medium", ], help="The dataset we can use for node classification example. Currently" - " ogbn-products, ogbn-arxiv, ogbn-papers100M datasets are supported.", + " ogbn-products, ogbn-arxiv, ogbn-papers100M and" + " igb-hom-[tiny|small|medium] datasets are supported.", ) parser.add_argument( "--mode", From 3d43ab8274a2c83448242ca15744dcceb3509915 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Sat, 17 Aug 2024 11:41:24 -0700 Subject: [PATCH 03/10] Update examples/graphbolt/node_classification.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/node_classification.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/graphbolt/node_classification.py b/examples/graphbolt/node_classification.py index bb871215259d..459b8a711af2 100644 --- a/examples/graphbolt/node_classification.py +++ b/examples/graphbolt/node_classification.py @@ -371,9 +371,9 @@ def parse_args(): "ogbn-arxiv", "ogbn-products", "ogbn-papers100M", - "igb-homogeneous-tiny", - "igb-homogeneous-small", - "igb-homogeneous-medium", + "igb-hom-tiny", + "igb-hom-small", + "igb-hom-medium", ], help="The dataset we can use for node classification example. 
Currently" " ogbn-products, ogbn-arxiv, ogbn-papers100M and" From df5c273c8b5eb37aadbafb189f69541753b5cf2b Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Sat, 17 Aug 2024 23:08:31 +0000 Subject: [PATCH 04/10] process data in download.py --- examples/graphbolt/download.py | 48 ++++++++++++++++------- examples/graphbolt/node_classification.py | 3 -- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/examples/graphbolt/download.py b/examples/graphbolt/download.py index 35da6fec58a0..4e4e7f3d1dcf 100644 --- a/examples/graphbolt/download.py +++ b/examples/graphbolt/download.py @@ -25,7 +25,16 @@ def build_yaml_helper(path, dataset_size, in_memory=True): ], "graph": { "edges": [ - {"format": "numpy", "path": "edges/paper__cites__paper.npy"} + { + "format": "numpy", + "path": "edges/paper__cites__paper.npy", + # "type": "paper:cites:paper" + }, + # { + # "format": "numpy", + # "path": "edges/paper__cites__paper_rev.npy", + # "type": "paper:rev_cites:paper" + # }, ], "nodes": [{"num": num_nodes[dataset_size]["paper"]}], }, @@ -228,20 +237,21 @@ def check_md5sum(dataset_type, dataset_size, filename): def download_dataset(path, dataset_type, dataset_size): """This is the script to download all the related datasets.""" + _dataset_type = dataset_type[:3] # For large datasets, use the two shell scripts to download. if dataset_size in ["large", "full"]: command = f"./download_{dataset_size}_igbh.sh" subprocess.run(["bash", command], check=True, text=True) - shutil.move(src=f"igb-{dataset_type}-{dataset_size}", dst=f"{path}") - return path + "/" + "igb-" + dataset_type + "-" + dataset_size + shutil.move(src=f"igb-{_dataset_type}-{dataset_size}", dst=f"{path}") + return path + "/" + "igb-" + _dataset_type + "-" + dataset_size # For the three smaller version, use the url to download. else: output_directory = path if not os.path.exists( output_directory + "igb_" - + dataset_type + + _dataset_type + "_" + dataset_size + ".tar.gz" @@ -256,7 +266,7 @@ def download_dataset(path, dataset_type, dataset_size): filename = ( path + "/igb_" - + dataset_type + + _dataset_type + "_" + dataset_size + ".tar.gz" @@ -273,20 +283,20 @@ def download_dataset(path, dataset_type, dataset_size): ) f.write(chunk) print( - "Downloaded" + " igb_" + dataset_type + "_" + dataset_size, + "Downloaded" + " igb_" + _dataset_type + "_" + dataset_size, end=" ->", ) check_md5sum(dataset_type, dataset_size, filename) else: # No need to download the tar file again if it is already downloaded. print( "The file igb_" - + dataset_type + + _dataset_type + "_" + dataset_size + ".tar.gz already exists, directly extracting..." 
) filename = ( - path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" + path + "/igb_" + _dataset_type + "_" + dataset_size + ".tar.gz" ) # Extract the tar file file = tarfile.open(filename) @@ -301,10 +311,15 @@ def download_dataset(path, dataset_type, dataset_size): # os.remove(filename) os.rename( output_directory + "/" + dataset_size, - output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size, + output_directory + + "/" + + "igb-" + + _dataset_type + + "-" + + dataset_size, ) return ( - output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size + output_directory + "/" + "igb-" + _dataset_type + "-" + dataset_size ) @@ -363,9 +378,9 @@ def split_data(label_path, set_dir, dataset_size, class_num): print(validation_indices) print(test_indices) - train_labels = labels[:train_end] - validation_labels = labels[train_end:validation_end] - test_labels = labels[validation_end:] + train_labels = labels[:train_end].astype(np.int64) + validation_labels = labels[train_end:validation_end].astype(np.int64) + test_labels = labels[validation_end:].astype(np.int64) print(train_labels, len(train_labels)) print(validation_labels, len(validation_labels)) print(test_labels, len(test_labels)) @@ -395,16 +410,21 @@ def add_edges(edges, source, dest, dataset_size): old_edge_path = source + "/" + edge + "/" + "edge_index.npy" new_edge_path = dest + "/" + edge + ".npy" + rev_edge_path = dest + "/" + edge + "_rev.npy" os.rename(src=old_edge_path, dst=new_edge_path) # edge_array = np.memmap(new_edge_path, dtype='int32', mode='r', shape=(num_edges[dataset_size][edge], 2)) edge_array = np.load(new_edge_path) new_edge_array = edge_array.transpose() + rev_edge_array = new_edge_array[:, ::-1] assert new_edge_array.shape == (2, num_edges[dataset_size][edge]) - assert np.array_equal(edge_array, new_edge_array.transpose()) + assert rev_edge_array.shape == (2, num_edges[dataset_size][edge]) + assert np.array_equal(new_edge_array, edge_array.transpose()) + assert np.array_equal(rev_edge_array, new_edge_array[:, ::-1]) gb.numpy_save_aligned(new_edge_path, new_edge_array) + gb.numpy_save_aligned(rev_edge_path, rev_edge_array) def process_feat(file_path, node_name, dataset_size): diff --git a/examples/graphbolt/node_classification.py b/examples/graphbolt/node_classification.py index 459b8a711af2..55e9d0cf3ecd 100644 --- a/examples/graphbolt/node_classification.py +++ b/examples/graphbolt/node_classification.py @@ -310,9 +310,6 @@ def train(args, graph, features, train_set, valid_set, num_classes, model): # in the last layer's computation graph. y = data.labels - if y.dtype != "int64": - y = y.long() - y_hat = model(data.blocks, x) # Compute loss. 
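
The runtime cast removed above is made unnecessary by the astype(np.int64)
calls this patch adds to split_data: PyTorch's cross-entropy loss requires
class-index targets of dtype torch.int64, so labels loaded straight from the
preprocessed .npy files can be consumed as-is. A minimal standalone sketch of
that requirement (illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    logits = torch.randn(4, 19)            # model outputs for 4 seed nodes
    labels = torch.tensor([3, 7, 0, 18])   # int64 ("long") targets by default
    loss = F.cross_entropy(logits, labels)  # accepted
    # F.cross_entropy(logits, labels.to(torch.int32)) would raise a dtype error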
From 8a173ad74fb6a244df953cdda6eab7d9d87c89c8 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Sun, 18 Aug 2024 01:19:27 +0000 Subject: [PATCH 05/10] correct rev shape error --- examples/graphbolt/download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/graphbolt/download.py b/examples/graphbolt/download.py index 4e4e7f3d1dcf..717b56f93787 100644 --- a/examples/graphbolt/download.py +++ b/examples/graphbolt/download.py @@ -416,12 +416,12 @@ def add_edges(edges, source, dest, dataset_size): # edge_array = np.memmap(new_edge_path, dtype='int32', mode='r', shape=(num_edges[dataset_size][edge], 2)) edge_array = np.load(new_edge_path) new_edge_array = edge_array.transpose() - rev_edge_array = new_edge_array[:, ::-1] + rev_edge_array = np.ascontiguousarray(new_edge_array[::-1]) assert new_edge_array.shape == (2, num_edges[dataset_size][edge]) assert rev_edge_array.shape == (2, num_edges[dataset_size][edge]) assert np.array_equal(new_edge_array, edge_array.transpose()) - assert np.array_equal(rev_edge_array, new_edge_array[:, ::-1]) + assert np.array_equal(rev_edge_array, new_edge_array[::-1]) gb.numpy_save_aligned(new_edge_path, new_edge_array) gb.numpy_save_aligned(rev_edge_path, rev_edge_array) From 0d9483698969d656feca4e76d1cc15b8167d160a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 29 Aug 2024 05:58:46 +0000 Subject: [PATCH 06/10] rm download.py, integrate in BuildInDataset --- examples/graphbolt/download.py | 597 -------------------- python/dgl/graphbolt/impl/ondisk_dataset.py | 6 + 2 files changed, 6 insertions(+), 597 deletions(-) delete mode 100644 examples/graphbolt/download.py diff --git a/examples/graphbolt/download.py b/examples/graphbolt/download.py deleted file mode 100644 index 717b56f93787..000000000000 --- a/examples/graphbolt/download.py +++ /dev/null @@ -1,597 +0,0 @@ -import argparse, hashlib, os, shutil, tarfile, yaml -import subprocess -import urllib.request as ur - -import dgl.graphbolt as gb -import numpy as np -from tqdm import tqdm - -GBFACTOR = 1 << 30 - - -def build_yaml_helper(path, dataset_size, in_memory=True): - """The stirng to build the yaml file. 
(Still need modification)""" - - data = { - "dataset_name": os.path.basename(path), - "feature_data": [ - { - "domain": "node", - "format": "numpy", - "in_memory": in_memory, - "name": "feat", - "path": "data/paper_feat.npy", - } - ], - "graph": { - "edges": [ - { - "format": "numpy", - "path": "edges/paper__cites__paper.npy", - # "type": "paper:cites:paper" - }, - # { - # "format": "numpy", - # "path": "edges/paper__cites__paper_rev.npy", - # "type": "paper:rev_cites:paper" - # }, - ], - "nodes": [{"num": num_nodes[dataset_size]["paper"]}], - }, - "tasks": [ - { - "name": "node_classification", - "num_classes": 19, - "test_set": [ - { - "data": [ - { - "format": "numpy", - "in_memory": in_memory, - "name": "seeds", - "path": "set/test_indices_19.npy", - }, - { - "format": "numpy", - "in_memory": in_memory, - "name": "labels", - "path": "set/test_labels_19.npy", - }, - ] - } - ], - "train_set": [ - { - "data": [ - { - "format": "numpy", - "in_memory": in_memory, - "name": "seeds", - "path": "set/train_indices_19.npy", - }, - { - "format": "numpy", - "in_memory": in_memory, - "name": "labels", - "path": "set/train_labels_19.npy", - }, - ] - } - ], - "validation_set": [ - { - "data": [ - { - "format": "numpy", - "in_memory": in_memory, - "name": "seeds", - "path": "set/validation_indices_19.npy", - }, - { - "format": "numpy", - "in_memory": in_memory, - "name": "labels", - "path": "set/validation_labels_19.npy", - }, - ] - } - ], - }, - { - "name": "node_classification_2K", - "num_classes": 2983, - "test_set": [ - { - "data": [ - { - "format": "numpy", - "in_memory": in_memory, - "name": "seeds", - "path": "set/test_indices_2983.npy", - }, - { - "format": "numpy", - "in_memory": in_memory, - "name": "labels", - "path": "set/test_labels_2983.npy", - }, - ] - } - ], - "train_set": [ - { - "data": [ - { - "format": "numpy", - "in_memory": in_memory, - "name": "seeds", - "path": "set/train_indices_2983.npy", - }, - { - "format": "numpy", - "in_memory": in_memory, - "name": "labels", - "path": "set/train_labels_2983.npy", - }, - ] - } - ], - "validation_set": [ - { - "data": [ - { - "format": "numpy", - "in_memory": in_memory, - "name": "seeds", - "path": "set/validation_indices_2983.npy", - }, - { - "format": "numpy", - "in_memory": in_memory, - "name": "labels", - "path": "set/validation_labels_2983.npy", - }, - ] - } - ], - }, - ], - } - - return data - - -def build_yaml(original_path, current_path, dataset_size): - """This build the yaml file differently based on the dataset size. - The two large datasets are put in disk while the other three smaller versions are in memory. 
- """ - if "large" == dataset_size or "full" == dataset_size: - data = build_yaml_helper( - path=original_path, dataset_size=dataset_size, in_memory=False - ) - else: - data = build_yaml_helper(path=original_path, dataset_size=dataset_size) - with open(f"{current_path}/metadata.yaml", "w") as file: - yaml.dump(data=data, stream=file, default_flow_style=False) - - -dataset_urls = { - "homogeneous": { - "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_tiny.tar.gz", - "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_small.tar.gz", - "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_medium.tar.gz", - }, - "heterogeneous": { - "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_tiny.tar.gz", - "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_small.tar.gz", - "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_medium.tar.gz", - }, -} - - -md5checksums = { - "homogeneous": { - "tiny": "34856534da55419b316d620e2d5b21be", - "small": "6781c699723529902ace0a95cafe6fe4", - "medium": "4640df4ceee46851fd18c0a44ddcc622", - }, - "heterogeneous": { - "tiny": "83fbc1091497ff92cf20afe82fae0ade", - "small": "2f42077be60a074aec24f7c60089e1bd", - "medium": "7f0df4296eca36553ff3a6a63abbd347", - }, -} - - -def decide_download(url): - """An interactive command line to confirm download.""" - d = ur.urlopen(url) - size = int(d.info()["Content-Length"]) / GBFACTOR - ### confirm if larger than 1GB - if size > 1: - return ( - input( - "This will download %.2fGB. Will you proceed? (y/N) " % (size) - ).lower() - == "y" - ) - else: - return True - - -def check_md5sum(dataset_type, dataset_size, filename): - """This is for checking the data correctness of the downloaded datasets.""" - original_md5 = md5checksums[dataset_type][dataset_size] - - with open(filename, "rb") as file_to_check: - data = file_to_check.read() - md5_returned = hashlib.md5(data).hexdigest() - - if original_md5 == md5_returned: - print(" md5sum verified.") - return - else: - os.remove(filename) - raise Exception(" md5sum verification failed!.") - - -def download_dataset(path, dataset_type, dataset_size): - """This is the script to download all the related datasets.""" - _dataset_type = dataset_type[:3] - - # For large datasets, use the two shell scripts to download. - if dataset_size in ["large", "full"]: - command = f"./download_{dataset_size}_igbh.sh" - subprocess.run(["bash", command], check=True, text=True) - shutil.move(src=f"igb-{_dataset_type}-{dataset_size}", dst=f"{path}") - return path + "/" + "igb-" + _dataset_type + "-" + dataset_size - # For the three smaller version, use the url to download. 
- else: - output_directory = path - if not os.path.exists( - output_directory - + "igb_" - + _dataset_type - + "_" - + dataset_size - + ".tar.gz" - ): - url = dataset_urls[dataset_type][dataset_size] - if decide_download(url): - data = ur.urlopen(url) - size = int(data.info()["Content-Length"]) - chunk_size = 1024 * 1024 - num_iter = int(size / chunk_size) + 2 - downloaded_size = 0 - filename = ( - path - + "/igb_" - + _dataset_type - + "_" - + dataset_size - + ".tar.gz" - ) - with open(filename, "wb") as f: - pbar = tqdm(range(num_iter)) - for i in pbar: - chunk = data.read(chunk_size) - downloaded_size += len(chunk) - pbar.set_description( - "Downloaded {:.2f} GB".format( - float(downloaded_size) / GBFACTOR - ) - ) - f.write(chunk) - print( - "Downloaded" + " igb_" + _dataset_type + "_" + dataset_size, - end=" ->", - ) - check_md5sum(dataset_type, dataset_size, filename) - else: # No need to download the tar file again if it is already downloaded. - print( - "The file igb_" - + _dataset_type - + "_" - + dataset_size - + ".tar.gz already exists, directly extracting..." - ) - filename = ( - path + "/igb_" + _dataset_type + "_" + dataset_size + ".tar.gz" - ) - # Extract the tar file - file = tarfile.open(filename) - file.extractall(output_directory) - file.close() - size = 0 - for path, dirs, files in os.walk(output_directory + "/" + dataset_size): - for f in files: - fp = os.path.join(path, f) - size += os.path.getsize(fp) - print("Final dataset size {:.2f} GB.".format(size / GBFACTOR)) - # os.remove(filename) - os.rename( - output_directory + "/" + dataset_size, - output_directory - + "/" - + "igb-" - + _dataset_type - + "-" - + dataset_size, - ) - return ( - output_directory + "/" + "igb-" + _dataset_type + "-" + dataset_size - ) - - -num_nodes = { - "full": { - "paper": 269346174, - }, - "large": { - "paper": 100000000, - }, - "medium": { - "paper": 10000000, - }, - "small": { - "paper": 1000000, - }, - "tiny": { - "paper": 100000, - }, -} - -num_edges = { - "full": { - "paper__cites__paper": 3996442004, - }, - "large": { - "paper__cites__paper": 1223571364, - }, - "medium": { - "paper__cites__paper": 120077694, - }, - "small": { - "paper__cites__paper": 12070502, - }, - "tiny": { - "paper__cites__paper": 447416, - }, -} - - -def split_data(label_path, set_dir, dataset_size, class_num): - """This is for splitting the labels into three sets: train, validation, and test sets.""" - # labels = np.memmap(label_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) - labels = np.load(label_path) - - total_samples = len(labels) - train_end = int(0.6 * total_samples) - validation_end = int(0.8 * total_samples) - - indices = np.arange(total_samples) - train_indices = indices[:train_end] - validation_indices = indices[train_end:validation_end] - test_indices = indices[validation_end:] - print(indices) - print(train_indices) - print(validation_indices) - print(test_indices) - - train_labels = labels[:train_end].astype(np.int64) - validation_labels = labels[train_end:validation_end].astype(np.int64) - test_labels = labels[validation_end:].astype(np.int64) - print(train_labels, len(train_labels)) - print(validation_labels, len(validation_labels)) - print(test_labels, len(test_labels)) - - gb.numpy_save_aligned( - f"{set_dir}/train_indices_{class_num}.npy", train_indices - ) - gb.numpy_save_aligned( - f"{set_dir}/validation_indices_{class_num}.npy", validation_indices - ) - gb.numpy_save_aligned( - f"{set_dir}/test_indices_{class_num}.npy", test_indices - ) - 
gb.numpy_save_aligned( - f"{set_dir}/train_labels_{class_num}.npy", train_labels - ) - gb.numpy_save_aligned( - f"{set_dir}/validation_labels_{class_num}.npy", validation_labels - ) - gb.numpy_save_aligned(f"{set_dir}/test_labels_{class_num}.npy", test_labels) - - -def add_edges(edges, source, dest, dataset_size): - """This is for processing the edges in the graph and convert them to correct shape.""" - for edge in edges: - print(f"\t Processing {edge} edge...") - - old_edge_path = source + "/" + edge + "/" + "edge_index.npy" - new_edge_path = dest + "/" + edge + ".npy" - rev_edge_path = dest + "/" + edge + "_rev.npy" - os.rename(src=old_edge_path, dst=new_edge_path) - - # edge_array = np.memmap(new_edge_path, dtype='int32', mode='r', shape=(num_edges[dataset_size][edge], 2)) - edge_array = np.load(new_edge_path) - new_edge_array = edge_array.transpose() - rev_edge_array = np.ascontiguousarray(new_edge_array[::-1]) - - assert new_edge_array.shape == (2, num_edges[dataset_size][edge]) - assert rev_edge_array.shape == (2, num_edges[dataset_size][edge]) - assert np.array_equal(new_edge_array, edge_array.transpose()) - assert np.array_equal(rev_edge_array, new_edge_array[::-1]) - - gb.numpy_save_aligned(new_edge_path, new_edge_array) - gb.numpy_save_aligned(rev_edge_path, rev_edge_array) - - -def process_feat(file_path, node_name, dataset_size): - """This is for processing the node features.""" - # array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) - array = np.load(file_path) - assert array.shape == (num_nodes[dataset_size][node_name], 1024) - gb.numpy_save_aligned(file_path, array) - - # Assert the shape and elements of the array are correct - # new_array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) - new_array = np.load(file_path) - assert new_array.shape == (num_nodes[dataset_size][node_name], 1024) - assert np.array_equal(array, new_array) - - -def process_label(file_path, num_class, dataset_size): - """This is for processing the node labels.""" - if ( - num_class == 2983 and dataset_size == "full" - ): # only this case label number changes - # array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) - array = np.load(file_path) - assert array.shape[0] == 227130858 - else: - # array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) - array = np.load(file_path) - assert array.shape[0] == num_nodes[dataset_size]["paper"] - - gb.numpy_save_aligned(file_path, array) - - # Assert the shape and elements of the array are correct - if num_class == 2983 and dataset_size == "full": - # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) - new_array = np.load(file_path) - assert new_array.shape[0] == 227130858 - assert np.array_equal(array, new_array) - else: - # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) - new_array = np.load(file_path) - assert new_array.shape[0] == num_nodes[dataset_size]["paper"] - assert np.array_equal(array, new_array) - - -def add_nodes(nodes, source, dest, dataset_size): - """This is for processing the nodes in the graph and store them in correct format.""" - for node in nodes: - print(f"\t Processing {node} node feature...") - old_node_path = source + "/" + node + "/" + "node_feat.npy" - new_node_path = dest + "/" + node + "_feat.npy" - os.rename(src=old_node_path, dst=new_node_path) - process_feat( - 
file_path=new_node_path, node_name=node, dataset_size=dataset_size - ) - # If the node is a paper type, process the labels - if node == "paper": - print(f"\t Processing {node} labels...") - old_label_path_19 = source + "/" + node + "/" + "node_label_19.npy" - new_label_path_19 = dest + "/" + "paper_label_19.npy" - os.rename(src=old_label_path_19, dst=new_label_path_19) - process_label( - file_path=new_label_path_19, - num_class=19, - dataset_size=dataset_size, - ) - - old_label_path_2K = source + "/" + node + "/" + "node_label_2K.npy" - new_label_path_2K = dest + "/" + "paper_label_2K.npy" - os.rename(src=old_label_path_2K, dst=new_label_path_2K) - process_label( - file_path=new_label_path_2K, - num_class=2983, - dataset_size=dataset_size, - ) - - return new_label_path_19, new_label_path_2K - - -def process_dataset(path, dataset_size): - print(f"Starting to process the {dataset_size} dataset...") - - # Step 0: Make the directory for processed dataset - processed_dir = path + "-seeds" - os.makedirs(name=processed_dir, exist_ok=True) - original_path = path + "/" + "processed" - - # Step 1: Move Nodes files - print("Processing Node files...") - node_dir = processed_dir + "/" + "data" - os.makedirs(name=node_dir, exist_ok=True) - # These are the one node in this homogeneous citation network - nodes = ["paper"] - label_file_19, label_file_2K = add_nodes( - nodes=nodes, - source=original_path, - dest=node_dir, - dataset_size=dataset_size, - ) - - # Step 2: Create labels - print("Processing train/valid/test files...") - set_dir = processed_dir + "/" + "set" - os.makedirs(name=set_dir, exist_ok=True) - split_data( - label_path=label_file_19, - set_dir=set_dir, - dataset_size=dataset_size, - class_num=19, - ) - split_data( - label_path=label_file_2K, - set_dir=set_dir, - dataset_size=dataset_size, - class_num=2983, - ) - - # Step 3: Move edge files - print("Processing Edge files...") - edge_dir = processed_dir + "/" + "edges" - os.makedirs(name=edge_dir, exist_ok=True) - # These are the one edge in this homogeneous citation network - edges = [ - "paper__cites__paper", - ] - add_edges( - edges=edges, - source=original_path, - dest=edge_dir, - dataset_size=dataset_size, - ) - - # Step 4: Build the yaml file - print("Building yaml file...") - build_yaml( - original_path=path, - current_path=processed_dir, - dataset_size=dataset_size, - ) - - # shutil.rmtree(path) - print(f"Finished processing the {dataset_size} dataset") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--path", - type=str, - default="datasets/", - help="path to store the datasets", - ) - parser.add_argument( - "--type", - type=str, - default="homogeneous", - choices=["homogeneous", "heterogeneous"], - help="dataset type", - ) - parser.add_argument( - "--size", - type=str, - default="tiny", - choices=["tiny", "small", "medium"], - help="size of the datasets", - ) - args = parser.parse_args() - path = download_dataset( - path=args.path, dataset_type=args.type, dataset_size=args.size - ) - process_dataset(path=path, dataset_size=args.size) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index d669dc825509..10c8f395fd14 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -1004,12 +1004,18 @@ class BuiltinDataset(OnDiskDataset): "ogbn-products-seeds", "ogbn-arxiv", "ogbn-arxiv-seeds", + "igb-hom-tiny", + "igb-hom-tiny-seeds", + "igb-hom-small", + "igb-hom-small-seeds", ] 
_large_datasets = [ "ogb-lsc-mag240m", "ogb-lsc-mag240m-seeds", "ogbn-papers100M", "ogbn-papers100M-seeds", + "igb-hom-medium", + "igb-hom-medium-seeds", ] _all_datasets = _datasets + _large_datasets From 1c65a7b7f6a2c0acc8b283a92c55d3be90c39802 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:32:04 -0500 Subject: [PATCH 07/10] Update ondisk_dataset.py doc string --- python/dgl/graphbolt/impl/ondisk_dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 10c8f395fd14..d2a60e235c2c 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -979,6 +979,16 @@ class BuiltinDataset(OnDiskDataset): .. note:: Reverse edges are added to the original graph. + **igb-hom-[tiny|small|medium]** + The igb-hom-[tiny|small|medium] dataset is a homogeneous citation network, + which is designed for developers to train and evaluate GNN models with high + fidelity. See more details in `igb-hom-[tiny|small|medium] + `_. + + .. note:: + Self edges are added to the original graph. + Node features are stored as float32. + Parameters ---------- name : str From f125d49ebc9f704fba1890f5dc0c14e587622e66 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:35:14 -0500 Subject: [PATCH 08/10] Update ondisk_dataset.py lint --- python/dgl/graphbolt/impl/ondisk_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index d2a60e235c2c..002128b9fa0d 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -981,8 +981,8 @@ class BuiltinDataset(OnDiskDataset): **igb-hom-[tiny|small|medium]** The igb-hom-[tiny|small|medium] dataset is a homogeneous citation network, - which is designed for developers to train and evaluate GNN models with high - fidelity. See more details in `igb-hom-[tiny|small|medium] + which is designed for developers to train and evaluate GNN models with + high fidelity. See more details in `igb-hom-[tiny|small|medium] `_. .. 
note:: From b9616a140c071527d461eb0e6304a3799146b4f0 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:38:20 -0500 Subject: [PATCH 09/10] Update examples/graphbolt/node_classification.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/node_classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/graphbolt/node_classification.py b/examples/graphbolt/node_classification.py index 55e9d0cf3ecd..6b0dee719746 100644 --- a/examples/graphbolt/node_classification.py +++ b/examples/graphbolt/node_classification.py @@ -37,7 +37,6 @@ │ └───> All nodes set inference & Test set evaluation """ - import argparse import time From 4f74219d78d57895a4603b430ae53bea6f5de148 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:40:31 -0500 Subject: [PATCH 10/10] Update ondisk_dataset.py on URL --- python/dgl/graphbolt/impl/ondisk_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 002128b9fa0d..df3b51f8b074 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -981,9 +981,9 @@ class BuiltinDataset(OnDiskDataset): **igb-hom-[tiny|small|medium]** The igb-hom-[tiny|small|medium] dataset is a homogeneous citation network, - which is designed for developers to train and evaluate GNN models with - high fidelity. See more details in `igb-hom-[tiny|small|medium] - `_. + which is designed for developers to train and evaluate GNN models with + high fidelity. See more details in `igb-hom-[tiny|small|medium] + `_. .. note:: Self edges are added to the original graph.
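
With the series applied, the IGB homogeneous graphs can be used like any other
GraphBolt builtin dataset. A minimal usage sketch (assuming the preprocessed
igb-hom-* artifacts are published on the DGL dataset server, which is what the
BuiltinDataset registration implies; names and sizes otherwise as defined in
the patches above):

    import dgl.graphbolt as gb

    # First use downloads and caches the preprocessed dataset.
    dataset = gb.BuiltinDataset("igb-hom-tiny").load()
    graph = dataset.graph
    features = dataset.feature
    task = dataset.tasks[0]  # "node_classification", 19 classes
    print(task.metadata["num_classes"], len(task.train_set))

or, end to end through the example script:

    python examples/graphbolt/node_classification.py --dataset igb-hom-tiny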