Merge pull request #218 from Ying-1106/main
add dataset from Neo4j
lazishu2000 committed Feb 6, 2024
2 parents 8abf8fa + dc20078 commit 21279c5
Showing 9 changed files with 244 additions and 7 deletions.
47 changes: 45 additions & 2 deletions README.md
@@ -169,14 +169,30 @@ cd OpenHGNN
pip install .
```



**5. Install gdbi (optional):**

- Install gdbi:
```bash
pip install git+https://github.com/xy-Ji/gdbi.git
```

- Install the graph database drivers:
```bash
pip install neo4j==5.16.0
pip install nebula3-python==3.4.0
```


#### Running an existing baseline model on an existing benchmark [dataset](./openhgnn/dataset/#Dataset)

```bash
python main.py -m model_name -d dataset_name -t task_name -g 0 --use_best_config --load_from_pretrained
```

usage: main.py [-h] [--model MODEL] [--task TASK] [--dataset DATASET]
[--gpu GPU] [--use_best_config] [--use_database]

*optional arguments*:

@@ -194,6 +210,8 @@ python main.py -m model_name -d dataset_name -t task_name -g 0 --use_best_config

``--load_from_pretrained`` will load the model from a default checkpoint.

``--use_database`` will load the dataset from a graph database.

e.g.:

@@ -204,6 +222,7 @@
```bash
python main.py -m GTN -d imdb4GTN -t node_classification -g 0 --use_best_config
```

Please refer to the [documentation](https://openhgnn.readthedocs.io/en/latest/index.html) for more basic and advanced usage.


#### Visualize training results with TensorBoard
```bash
tensorboard --logdir=./openhgnn/output/{model_name}/
```
@@ -212,8 +231,32 @@
```bash
tensorboard --logdir=./openhgnn/output/RGCN/
```

**Note**: You need to run the model you want to visualize at least once before its results can be visualized with the command above.

#### Use gdbi to access standard graph data in a database
Take the neo4j database and the imdb dataset as an example (a consolidated Python sketch follows these steps):
- Construct the CSV files for the graph dataset (node-level: A.csv, edge-level: A_P.csv)
- Import the CSV files into the graph database:
```bash
LOAD CSV WITH HEADERS FROM "file:///data.csv" AS row
CREATE (:graphname_labelname {ID: row.ID, ... });
```
- Add the user information needed to access the graph database in the config.py file:
```python
self.graph_address = [graph_address]
self.user_name = [user_name]
self.password = [password]
```

- e.g.:

```bash
python main.py -m MAGNN -d imdb4MAGNN -t node_classification -g 0 --use_best_config --use_database
```
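
The three steps above are what `IMDB4MAGNN_Dataset.download()` (added in this PR, see `openhgnn/dataset/academic_graph.py` below) performs internally. A condensed sketch of that logic, with the address and credentials as placeholders:

```python
from gdbi import NodeExportConfig, EdgeExportConfig, Neo4jInterface

# Export schema of imdb4MAGNN: node types A/M/D carry an 'attribute' feature,
# and 'M' (movie) nodes additionally carry a 'label'
node_export_config = [
    NodeExportConfig('A', ['attribute']),
    NodeExportConfig('M', ['attribute'], ['label']),
    NodeExportConfig('D', ['attribute'])
]
edge_export_config = [
    EdgeExportConfig('A_M', ('A', 'M')),
    EdgeExportConfig('M_A', ('M', 'A')),
    EdgeExportConfig('M_D', ('M', 'D')),
    EdgeExportConfig('D_M', ('D', 'M'))
]

graph_database = Neo4jInterface()
conn = graph_database.GraphDBConnection('bolt://localhost:7687',  # placeholder address
                                        'neo4j', 'password')      # placeholder credentials
graph = graph_database.get_graph(conn, 'imdb4MAGNN', node_export_config, edge_export_config)
# graph is a dict with 'X_dict', 'Y_dict' and 'edge_index_dict' entries
```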



## [Models](./openhgnn/models/#Model)

### Supported models for specific tasks
47 changes: 46 additions & 1 deletion README_EN.md
@@ -171,14 +171,32 @@ cd OpenHGNN
pip install .
```


**5. Install gdbi (optional):**

- Install gdbi from git:
```bash
pip install git+https://github.com/xy-Ji/gdbi.git
```

- Install the graph database drivers from PyPI:
```bash
pip install neo4j==5.16.0
pip install nebula3-python==3.4.0
```





#### Running an existing baseline model on an existing benchmark [dataset](../openhgnn/dataset/#Dataset)

```bash
python main.py -m model_name -d dataset_name -t task_name -g 0 --use_best_config --load_from_pretrained
```

usage: main.py [-h] [--model MODEL] [--task TASK] [--dataset DATASET]
[--gpu GPU] [--use_best_config] [--use_database]

*optional arguments*:

@@ -198,6 +216,8 @@ will override the parameter in config.ini.

``--load_from_pretrained`` will load the model from a default checkpoint.

``--use_database`` will load the dataset from a graph database.

e.g.:

```bash
python main.py -m GTN -d imdb4GTN -t node_classification -g 0 --use_best_config
```
@@ -218,6 +238,31 @@
```bash
tensorboard --logdir=./openhgnn/output/RGCN/
```
**Note**: To visualize results, you need to train the model first.


#### Use gdbi to get a graph dataset
Take the neo4j database and the imdb dataset as an example (an example config.py snippet follows these steps):
- Construct the CSV files for the dataset (node-level: A.csv, edge-level: A_P.csv)
- Import the CSV files into the database:
```bash
LOAD CSV WITH HEADERS FROM "file:///data.csv" AS row
CREATE (:graphname_labelname {ID: row.ID, ... });
```
- Add the user information needed to access the database in the config.py file:
```python
self.graph_address = [graph_address]
self.user_name = [user_name]
self.password = [password]
```

- e.g.:

```bash
python main.py -m MAGNN -d imdb4MAGNN -t node_classification -g 0 --use_best_config --use_database
```
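
To make the config.py step concrete, a filled-in sketch (the address and credentials below are placeholders for a local Neo4j instance, not values shipped with OpenHGNN):

```python
# openhgnn/config.py, inside the branch for the chosen model (MAGNN here)
self.graph_address = 'bolt://localhost:7687'  # placeholder: your graph database address
self.user_name = 'neo4j'                      # placeholder: your database user
self.password = 'your_password'               # placeholder: your database password
```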




## [Models](../openhgnn/models/#Model)

### Supported models for specific tasks
1 change: 1 addition & 0 deletions main.py
@@ -16,6 +16,7 @@
parser.add_argument('--gpu', '-g', default='-1', type=int, help='-1 means cpu')
parser.add_argument('--use_best_config', action='store_true', help='will load utils.best_config')
parser.add_argument('--load_from_pretrained', action='store_true', help='load model from the checkpoint')
parser.add_argument('--use_database', action='store_true', help='load the dataset from a graph database')
args = parser.parse_args()

experiment = Experiment(model=args.model, dataset=args.dataset, task=args.task, gpu=args.gpu,
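
The new flag is forwarded to `Experiment` (see the `openhgnn/experiment.py` hunk below). Assuming the script finishes with `experiment.run()` as in the unchanged remainder of main.py, and the package-level `Experiment` export used in OpenHGNN's docs, the equivalent programmatic call would be:

```python
from openhgnn import Experiment

# Equivalent to:
#   python main.py -m MAGNN -d imdb4MAGNN -t node_classification -g -1 \
#                  --use_best_config --use_database
experiment = Experiment(model='MAGNN', dataset='imdb4MAGNN', task='node_classification',
                        gpu=-1, use_best_config=True, use_database=True)
experiment.run()  # assumption: run() is the entry point invoked by main.py
```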
5 changes: 5 additions & 0 deletions openhgnn/config.py
@@ -327,6 +327,11 @@ def __init__(self, file_path, model, dataset, task, gpu):
self.ff_layer = conf.getint('NARS', 'ff_layer')

elif self.model_name == 'MAGNN':

# Connection info for fetching imdb4MAGNN via gdbi when --use_database is set;
# empty by default — filled in by the user as described in the README
self.graph_address = ''
self.user_name = ''
self.password = ''

self.lr = conf.getfloat("MAGNN", "learning_rate")
self.weight_decay = conf.getfloat("MAGNN", "weight_decay")
self.seed = conf.getint("MAGNN", "seed")
16 changes: 14 additions & 2 deletions openhgnn/dataset/NodeClassificationDataset.py
@@ -10,7 +10,7 @@
from ogb.nodeproppred import DglNodePropPredDataset
from . import load_acm_raw
from . import BaseDataset, register_dataset
from . import AcademicDataset, HGBDataset, OHGBDataset, IMDB4MAGNN_Dataset
from .utils import sparse_mx_to_torch_sparse_tensor
from ..utils import add_reverse_edges

@@ -181,8 +181,15 @@ class HIN_NodeClassification(NodeClassificationDataset):

def __init__(self, dataset_name, *args, **kwargs):
super(HIN_NodeClassification, self).__init__(*args, **kwargs)

self.args = kwargs.get('args')  # config object; None when no args are forwarded

self.g, self.category, self.num_classes = self.load_HIN(dataset_name)


def load_HIN(self, name_dataset):
if name_dataset == 'demo_graph':
data_path = './openhgnn/dataset/demo_graph.bin'
@@ -211,7 +218,12 @@ def load_HIN(self, name_dataset):
self.in_dim = g.ndata['h'][category].shape[1]

elif name_dataset == 'imdb4MAGNN':
# self.args may be None when no CLI args were forwarded, so guard before reading it
if self.args is not None and self.args.use_database:
    dataset = IMDB4MAGNN_Dataset(name='imdb4MAGNN', args=self.args)
else:
    dataset = AcademicDataset(name='imdb4MAGNN', raw_dir='')

category = 'M'
g = dataset[0].long()
num_classes = 3
6 changes: 5 additions & 1 deletion openhgnn/dataset/__init__.py
@@ -2,7 +2,7 @@
from dgl.data import DGLDataset
from .base_dataset import BaseDataset
from .utils import load_acm, load_acm_raw, generate_random_hg
from .academic_graph import AcademicDataset, IMDB4MAGNN_Dataset
from .hgb_dataset import HGBDataset
from .ohgb_dataset import OHGBDataset
from .gtn_dataset import *
@@ -115,6 +115,10 @@ def build_dataset(dataset, task, *args, **kwargs):
'Book-Crossing', 'amazon4SLICE', 'MTWM', 'HNE-PubMed', 'HGBl-ACM', 'HGBl-DBLP', 'HGBl-IMDB',
'amazon', 'yelp4HGSL']:
_dataset = 'hin_' + task
elif dataset in ['imdb4MAGNN']:
    # imdb4MAGNN may be fetched via gdbi, so the config args are forwarded to the dataset
    _dataset = 'hin_' + task
    return DATASET_REGISTRY[_dataset](dataset, logger=kwargs['logger'], args=kwargs['args'])
elif dataset in ohgbn_datasets + ohgbl_datasets:
_dataset = 'ohgb_' + task
elif dataset in ['ogbn-mag']:
124 changes: 124 additions & 0 deletions openhgnn/dataset/academic_graph.py
@@ -9,6 +9,130 @@
import torch as th



# Dataset fetched from a graph database (Neo4j or NebulaGraph) via gdbi
class IMDB4MAGNN_Dataset(DGLDataset):

def __init__(self, name, args, raw_dir=None, force_reload=False, verbose=True):
    assert name in ['imdb4MAGNN', ]
    self.args = args
    super(IMDB4MAGNN_Dataset, self).__init__(name=name, url=None, raw_dir=raw_dir,
                                             force_reload=force_reload, verbose=verbose)


def download(self):
    # Export schema: node types A/M/D each carry an 'attribute' feature; 'M' also carries labels
    from gdbi import NodeExportConfig, EdgeExportConfig, Neo4jInterface, NebulaInterface
    node_export_config = [
        NodeExportConfig('A', ['attribute']),
        NodeExportConfig('M', ['attribute'], ['label']),
        NodeExportConfig('D', ['attribute'])
    ]

    edge_export_config = [
        EdgeExportConfig('A_M', ('A', 'M')),
        EdgeExportConfig('M_A', ('M', 'A')),
        EdgeExportConfig('M_D', ('M', 'D')),
        EdgeExportConfig('D_M', ('D', 'M'))
    ]

    # Neo4j backend
    graph_database = Neo4jInterface()

    # NebulaGraph backend (alternative):
    # graph_database = NebulaInterface()

    graph_address = self.args.graph_address
    user_name = self.args.user_name
    password = self.args.password

    conn = graph_database.GraphDBConnection(graph_address, user_name, password)
    self.graph = graph_database.get_graph(conn, 'imdb4MAGNN', node_export_config, edge_export_config)




def process(self):
    # Convert gdbi's edge_index_dict (keys like 'A_M') into DGL canonical edge types
    graph = self.graph
    cano_edges = {}
    for edge_type in graph['edge_index_dict'].keys():  # e.g. 'A_M'
        src_type = edge_type[0]   # 'A'
        dst_type = edge_type[-1]  # 'M'
        edge_type_2 = src_type + '-' + dst_type  # 'A-M'
        cano_edge_type = (src_type, edge_type_2, dst_type)  # ('A', 'A-M', 'M')
        u, v = graph['edge_index_dict'][edge_type][0], graph['edge_index_dict'][edge_type][1]
        cano_edges[cano_edge_type] = (u, v)

    hg = dgl.heterograph(cano_edges)

    # Attach node features; movie ('M') nodes also get labels
    for node_type in graph['X_dict'].keys():
        hg.nodes[node_type].data['h'] = graph['X_dict'][node_type]
        if node_type == 'M':
            hg.nodes[node_type].data['labels'] = graph['Y_dict'][node_type]

    # Random 400 / 400 / 3,478 train/val/test split over the 4,278 'M' nodes
    # (torch is already imported as th at the top of this file)
    num_nodes = 4278
    random_indices = th.randperm(num_nodes)

    num_train = 400
    num_val = 400

    train_mask = th.zeros(num_nodes, dtype=th.int)
    train_mask[random_indices[:num_train]] = 1
    val_mask = th.zeros(num_nodes, dtype=th.int)
    val_mask[random_indices[num_train:num_train + num_val]] = 1
    test_mask = th.zeros(num_nodes, dtype=th.int)
    test_mask[random_indices[num_train + num_val:]] = 1

    # The three masks must be pairwise disjoint
    assert th.sum(train_mask * val_mask) == 0
    assert th.sum(train_mask * test_mask) == 0
    assert th.sum(val_mask * test_mask) == 0

    hg.nodes['M'].data['train_mask'] = train_mask
    hg.nodes['M'].data['val_mask'] = val_mask
    hg.nodes['M'].data['test_mask'] = test_mask

    self._g = hg




def __getitem__(self, idx):
    # get one example by index
    assert idx == 0, "This dataset has only one graph"
    return self._g

def __len__(self):
    return 1

def save(self):
    pass

def load(self):
    pass

def has_cache(self):
    pass
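
# Usage sketch (hypothetical, for illustration; mirrors how HIN_NodeClassification.load_HIN
# in this PR consumes the class):
#   dataset = IMDB4MAGNN_Dataset(name='imdb4MAGNN', args=self.args)
#   hg = dataset[0].long()  # the single DGL heterograph, with train/val/test masks on 'M' nodes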






class AcademicDataset(DGLDataset):

_prefix = 'https://s3.cn-north-1.amazonaws.com.cn/dgl-data/'
2 changes: 2 additions & 0 deletions openhgnn/experiment.py
@@ -85,13 +85,15 @@ def __init__(self, model, dataset, task,
hpo_trials: int = 100,
output_dir: str = "./openhgnn/output",
conf_path: str = default_conf_path,
use_database: bool = False,
**kwargs):
self.config = Config(file_path=conf_path, model=model, dataset=dataset, task=task, gpu=gpu)
self.config.model = model
self.config.dataset = dataset
self.config.task = task
self.config.gpu = gpu
self.config.use_best_config = use_best_config
self.config.use_database = use_database
# self.config.use_hpo = use_hpo
self.config.load_from_pretrained = load_from_pretrained
self.config.output_dir = os.path.join(output_dir, self.config.model_name)
3 changes: 2 additions & 1 deletion openhgnn/tasks/node_classification.py
@@ -31,7 +31,8 @@ class NodeClassification(BaseTask):
def __init__(self, args):
super(NodeClassification, self).__init__()
self.logger = args.logger
self.dataset = build_dataset(args.dataset, 'node_classification', logger=self.logger, args=args)
# self.evaluator = Evaluator()
self.logger = args.logger
if hasattr(args, 'validation'):
Expand Down
