diff --git a/README.md b/README.md index da59c176..404011de 100644 --- a/README.md +++ b/README.md @@ -169,14 +169,30 @@ cd OpenHGNN pip install . ``` + + +**5. 安装 gdbi(可选):** + +- 安装gdbi +```bash +pip install git+https://github.com/xy-Ji/gdbi.git +``` + +- 安装图数据库 +```bash +pip install neo4j==5.16.0 +pip install nebula3-python==3.4.0 +``` + + #### 在已有的评测上运行已有的基线模型 [数据集](./openhgnn/dataset/#Dataset) ```bash -python main.py -m model_name -d dataset_name -t task_name -g 0 --use_best_config --load_from_pretrained +python main.py -m model_name -d dataset_name -t task_name -g 0 --use_best_config --load_from_pretrained ``` 使用方法: main.py [-h] [--model MODEL] [--task TASK] [--dataset DATASET] - [--gpu GPU] [--use_best_config] + [--gpu GPU] [--use_best_config][--use_database] *可选参数*: @@ -194,6 +210,8 @@ python main.py -m model_name -d dataset_name -t task_name -g 0 --use_best_config ``--load_from_pretrained`` 从默认检查点加载模型。 +``--use_database`` 从数据库加载数据集 + 示例: ```bash @@ -204,6 +222,7 @@ python main.py -m GTN -d imdb4GTN -t node_classification -g 0 --use_best_config 请参考 [文档](https://openhgnn.readthedocs.io/en/latest/index.html) 了解更多的基础和进阶的使用方法。 + #### 使用TensorBoard可视化训练结果 ```bash tensorboard --logdir=./openhgnn/output/{model_name}/ @@ -212,8 +231,32 @@ tensorboard --logdir=./openhgnn/output/{model_name}/ ```bash tensorboard --logdir=./openhgnn/output/RGCN/ ``` + **提示**:需要先运行一次你想要可视化的模型,才能用以上命令可视化结果。 +#### 使用gdbi访问数据库中的标准图数据 +以neo4j数据库和imdb数据集为例 +- 构造图数据集的csv文件(节点级:A.csv,连接级:A_P.csv) +- 导入csv文件到图数据库中 +```bash +LOAD CSV WITH HEADERS FROM "file:///data.csv" AS row +CREATE (:graphname_labelname {ID: row.ID, ... }); +``` +- 在config.py文件中添加访问图数据库所需的用户信息 +```python +self.graph_address = [graph_address] +self.user_name = [user_name] +self.password = [password] +``` + +- 示例: + +```bash +python main.py -m MAGNN -d imdb4MAGNN -t node_classification -g 0 --use_best_config --use_database +``` + + + ## [模型](./openhgnn/models/#Model) ### 特定任务下支持的模型 diff --git a/README_EN.md b/README_EN.md index 728053a2..c23bc783 100644 --- a/README_EN.md +++ b/README_EN.md @@ -171,6 +171,24 @@ cd OpenHGNN pip install . ``` + +**5. Install gdbi(Optional):** + +- install gdbi from git +```bash +pip install git+https://github.com/xy-Ji/gdbi.git +``` + +- install graph database from pypi +```bash +pip install neo4j==5.16.0 +pip install nebula3-python==3.4.0 +``` + + + + + #### Running an existing baseline model on an existing benchmark [dataset](../openhgnn/dataset/#Dataset) ```bash @@ -178,7 +196,7 @@ python main.py -m model_name -d dataset_name -t task_name -g 0 --use_best_config ``` usage: main.py [-h] [--model MODEL] [--task TASK] [--dataset DATASET] -[--gpu GPU] [--use_best_config] +[--gpu GPU] [--use_best_config][--use_database] *optional arguments*: @@ -198,6 +216,8 @@ will override the parameter in config.ini. ``--load_from_pretrained`` will load the model from a default checkpoint. +``--use_database`` get dataset from database + e.g.: ```bash @@ -218,6 +238,31 @@ tensorboard --logdir=./openhgnn/output/RGCN/ ``` **Note**: To visualize results, you need to train the model first. + +#### Use gdbi to get grpah dataset +take neo4j and imdb dataset for example +- construct csv file for dataset(node-level:A.csv,edge-level:A_P.csv) +- import csv file to database +```bash +LOAD CSV WITH HEADERS FROM "file:///data.csv" AS row +CREATE (:graphname_labelname {ID: row.ID, ... }); +``` +- add user information to access database in config.py file +```python +self.graph_address = [graph_address] +self.user_name = [user_name] +self.password = [password] +``` + +- e.g.: + +```bash +python main.py -m MAGNN -d imdb4MAGNN -t node_classification -g 0 --use_best_config --use_database +``` + + + + ## [Models](../openhgnn/models/#Model) ### Supported Models with specific task diff --git a/main.py b/main.py index 17c07321..733a3282 100644 --- a/main.py +++ b/main.py @@ -16,6 +16,7 @@ parser.add_argument('--gpu', '-g', default='-1', type=int, help='-1 means cpu') parser.add_argument('--use_best_config', action='store_true', help='will load utils.best_config') parser.add_argument('--load_from_pretrained', action='store_true', help='load model from the checkpoint') + parser.add_argument('--use_database',action='store_true',help = 'use database') args = parser.parse_args() experiment = Experiment(model=args.model, dataset=args.dataset, task=args.task, gpu=args.gpu, diff --git a/openhgnn/config.py b/openhgnn/config.py index a7541ac7..eb56d319 100644 --- a/openhgnn/config.py +++ b/openhgnn/config.py @@ -327,6 +327,11 @@ def __init__(self, file_path, model, dataset, task, gpu): self.ff_layer = conf.getint('NARS', 'ff_layer') elif self.model_name == 'MAGNN': + + self.graph_address = '' + self.user_name = '' + self.password = '' + self.lr = conf.getfloat("MAGNN", "learning_rate") self.weight_decay = conf.getfloat("MAGNN", "weight_decay") self.seed = conf.getint("MAGNN", "seed") diff --git a/openhgnn/dataset/NodeClassificationDataset.py b/openhgnn/dataset/NodeClassificationDataset.py index d1db91fb..be2e473a 100644 --- a/openhgnn/dataset/NodeClassificationDataset.py +++ b/openhgnn/dataset/NodeClassificationDataset.py @@ -10,7 +10,7 @@ from ogb.nodeproppred import DglNodePropPredDataset from . import load_acm_raw from . import BaseDataset, register_dataset -from . import AcademicDataset, HGBDataset, OHGBDataset +from . import AcademicDataset, HGBDataset, OHGBDataset,IMDB4MAGNN_Dataset from .utils import sparse_mx_to_torch_sparse_tensor from ..utils import add_reverse_edges @@ -181,8 +181,15 @@ class HIN_NodeClassification(NodeClassificationDataset): def __init__(self, dataset_name, *args, **kwargs): super(HIN_NodeClassification, self).__init__(*args, **kwargs) + + if 'args' in kwargs: + self.args = kwargs['args'] + else: + self.args = None + self.g, self.category, self.num_classes = self.load_HIN(dataset_name) + def load_HIN(self, name_dataset): if name_dataset == 'demo_graph': data_path = './openhgnn/dataset/demo_graph.bin' @@ -211,7 +218,12 @@ def load_HIN(self, name_dataset): self.in_dim = g.ndata['h'][category].shape[1] elif name_dataset == 'imdb4MAGNN': - dataset = AcademicDataset(name='imdb4MAGNN', raw_dir='') + + if self.args.use_database == True: + dataset = IMDB4MAGNN_Dataset(name='imdb4MAGNN',args = self.args) + else: + dataset = AcademicDataset(name='imdb4MAGNN', raw_dir='') + category = 'M' g = dataset[0].long() num_classes = 3 diff --git a/openhgnn/dataset/__init__.py b/openhgnn/dataset/__init__.py index bb241df0..4b5553a5 100644 --- a/openhgnn/dataset/__init__.py +++ b/openhgnn/dataset/__init__.py @@ -2,7 +2,7 @@ from dgl.data import DGLDataset from .base_dataset import BaseDataset from .utils import load_acm, load_acm_raw, generate_random_hg -from .academic_graph import AcademicDataset +from .academic_graph import AcademicDataset,IMDB4MAGNN_Dataset from .hgb_dataset import HGBDataset from .ohgb_dataset import OHGBDataset from .gtn_dataset import * @@ -115,6 +115,10 @@ def build_dataset(dataset, task, *args, **kwargs): 'Book-Crossing', 'amazon4SLICE', 'MTWM', 'HNE-PubMed', 'HGBl-ACM', 'HGBl-DBLP', 'HGBl-IMDB', 'amazon', 'yelp4HGSL']: _dataset = 'hin_' + task + elif dataset in ['imdb4MAGNN']: + _dataset = 'hin_' + task + return DATASET_REGISTRY[_dataset](dataset, logger=kwargs['logger'], + args = kwargs['args'] ) elif dataset in ohgbn_datasets + ohgbl_datasets: _dataset = 'ohgb_' + task elif dataset in ['ogbn-mag']: diff --git a/openhgnn/dataset/academic_graph.py b/openhgnn/dataset/academic_graph.py index 48c31245..a4b149a0 100644 --- a/openhgnn/dataset/academic_graph.py +++ b/openhgnn/dataset/academic_graph.py @@ -9,6 +9,130 @@ import torch as th + +# get dataset from database +class IMDB4MAGNN_Dataset(DGLDataset): + + def __init__(self, name, args, raw_dir=None, force_reload=False, verbose=True): + assert name in ['imdb4MAGNN', ] + + self.args = args + super(IMDB4MAGNN_Dataset, self).__init__(name=name, + url=None, + raw_dir=None, + force_reload=force_reload, + verbose=verbose) + + + def download(self): + + from gdbi import NodeExportConfig, EdgeExportConfig, Neo4jInterface, NebulaInterface + node_export_config = [ + NodeExportConfig('A', ['attribute'] ), + NodeExportConfig('M', ['attribute'], ['label']), + NodeExportConfig('D', ['attribute']) + ] + + edge_export_config = [ + EdgeExportConfig('A_M', ('A','M')), + EdgeExportConfig('M_A', ('M','A')), + EdgeExportConfig('M_D', ('M','D')), + EdgeExportConfig('D_M', ('D','M')) + ] + + # neo4j + graph_database = Neo4jInterface() + + # # nebula + # graph_database = NebulaInterface() + + graph_address = self.args.graph_address + user_name = self.args.user_name + password = self.args.password + + conn = graph_database.GraphDBConnection(graph_address, user_name, password) + self.graph = graph_database.get_graph(conn, 'imdb4MAGNN', node_export_config, edge_export_config) + + + + + def process(self): + + graph = self.graph + cano_edges = {} + for edge_type in graph['edge_index_dict'].keys(): # 'A_M' + src_type = edge_type[0] # A + dst_type = edge_type[-1] # M + edge_type_2 = src_type + '-' + dst_type # A-M + + cano_edge_type = (src_type,edge_type_2,dst_type) # ('A','A-M','M') + u,v = graph['edge_index_dict'][edge_type][0] ,graph['edge_index_dict'][edge_type][1] + + cano_edges[cano_edge_type] = (u,v) + + + + hg = dgl.heterograph(cano_edges) + + for node_type in graph['X_dict'].keys() : + hg.nodes[node_type].data['h'] = graph['X_dict'][node_type] + if node_type == 'M': + hg.nodes[node_type].data['labels'] = graph['Y_dict'][node_type] + + import torch + + + + num_nodes = 4278 + random_indices = torch.randperm(num_nodes) + + num_train = 400 + num_val = 400 + num_test = 3478 + + train_mask = torch.zeros(num_nodes, dtype=torch.int) + train_mask[random_indices[:num_train]] = 1 + val_mask = torch.zeros(num_nodes, dtype=torch.int) + val_mask[random_indices[num_train:num_train+num_val]] = 1 + test_mask = torch.zeros(num_nodes, dtype=torch.int) + test_mask[random_indices[num_train+num_val:]] = 1 + + assert torch.sum(train_mask * val_mask) == 0 + assert torch.sum(train_mask * test_mask) == 0 + assert torch.sum(val_mask * test_mask) == 0 + + hg.nodes['M'].data['train_mask'] = train_mask + hg.nodes['M'].data['val_mask'] = val_mask + hg.nodes['M'].data['test_mask'] = test_mask + + self._g = hg + + + + + def __getitem__(self, idx): + # get one example by index + assert idx == 0, "This dataset has only one graph" + return self._g + + def __len__(self): + return 1 + + + def save(self): + pass + + def load(self): + pass + + def has_cache(self): + pass + + + + + + class AcademicDataset(DGLDataset): _prefix = 'https://s3.cn-north-1.amazonaws.com.cn/dgl-data/' diff --git a/openhgnn/experiment.py b/openhgnn/experiment.py index 075ef381..efe84d27 100644 --- a/openhgnn/experiment.py +++ b/openhgnn/experiment.py @@ -85,6 +85,7 @@ def __init__(self, model, dataset, task, hpo_trials: int = 100, output_dir: str = "./openhgnn/output", conf_path: str = default_conf_path, + use_database:bool = False, **kwargs): self.config = Config(file_path=conf_path, model=model, dataset=dataset, task=task, gpu=gpu) self.config.model = model @@ -92,6 +93,7 @@ def __init__(self, model, dataset, task, self.config.task = task self.config.gpu = gpu self.config.use_best_config = use_best_config + self.config.use_database = use_database # self.config.use_hpo = use_hpo self.config.load_from_pretrained = load_from_pretrained self.config.output_dir = os.path.join(output_dir, self.config.model_name) diff --git a/openhgnn/tasks/node_classification.py b/openhgnn/tasks/node_classification.py index 3565aae0..4352edb4 100644 --- a/openhgnn/tasks/node_classification.py +++ b/openhgnn/tasks/node_classification.py @@ -31,7 +31,8 @@ class NodeClassification(BaseTask): def __init__(self, args): super(NodeClassification, self).__init__() self.logger = args.logger - self.dataset = build_dataset(args.dataset, 'node_classification', logger=self.logger) + self.dataset = build_dataset(args.dataset, 'node_classification', + logger=self.logger,args = args) # self.evaluator = Evaluator() self.logger = args.logger if hasattr(args, 'validation'):