Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial duplicate edges fix #50

Merged
merged 1 commit into from
Apr 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions cat_merge/merge.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import os
import tarfile
from typing import List

import yaml
import logging

from cat_merge.file_utils import *
from cat_merge.merge_utils import *
from cat_merge.file_utils import read_dfs, read_tar_dfs, get_files, write
from cat_merge.merge_utils import merge_kg
from cat_merge.qc_utils import create_qc_report

log = logging.getLogger(__name__)


def merge(
name: str = "merged-kg",
source: str = None, # Optional directory or tar archive containing node and edge files
Expand Down Expand Up @@ -57,7 +60,7 @@ def merge(
mapping_dfs = read_dfs(mappings, add_source_col=None)

print("Merging...")
kg = merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping_dfs=mapping_dfs, merge_delimiter=merge_delimiter)
kg, qc = merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping_dfs=mapping_dfs, merge_delimiter=merge_delimiter)
write(
name=name,
kg=kg,
Expand All @@ -66,7 +69,7 @@ def merge(

if qc_report:
print("Generating QC report")
qc_report = create_qc_report(kg)
qc_report = create_qc_report(kg, qc)

with open(f"{output_dir}/qc_report.yaml", "w") as report_file:
yaml.dump(qc_report, report_file)
Expand Down
19 changes: 13 additions & 6 deletions cat_merge/merge_utils.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
import pandas as pd
from pandas.core.frame import DataFrame
from typing import List
from cat_merge.model.merged_kg import MergedKG
from typing import List, Tuple
from cat_merge.model.merged_kg import MergedKG, MergeQC
from cat_merge.mapping_utils import apply_mappings
import numpy as np


def concat_dataframes(dataframes: List[DataFrame]) -> DataFrame:
    """Stack the given dataframes row-wise into a single dataframe.

    :param dataframes: dataframes to combine, in the order they should appear
    :return: one dataframe containing the rows of every input frame
    """
    # axis=0 appends rows; each frame keeps its own index labels, so the
    # combined index may contain repeated labels.
    return pd.concat(dataframes, axis=0)


def get_duplicate_rows(df: DataFrame) -> DataFrame:
def get_duplicates_by_id(df: DataFrame) -> DataFrame:
return df[df.id.duplicated(keep=False)]


Expand All @@ -19,6 +20,7 @@ def clean_nodes(nodes: DataFrame, merge_delimiter: str = " ") -> DataFrame:


def clean_edges(edges: DataFrame, nodes: DataFrame) -> DataFrame:
    """Return only the edges that are safe to include in the merged graph.

    An edge is kept when its ``id`` is unique among ``edges`` and both its
    ``subject`` and ``object`` appear in ``nodes.id``.

    :param edges: dataframe with ``id``, ``subject`` and ``object`` columns
    :param nodes: dataframe with an ``id`` column
    :return: the filtered edges dataframe
    """
    # keep=False discards *every* row whose id is shared, not just the extra
    # copies: no occurrence of a duplicated edge id survives.
    unique_edges = edges.drop_duplicates(subset=['id'], keep=False)
    has_known_endpoints = unique_edges.subject.isin(nodes.id) & unique_edges.object.isin(nodes.id)
    return unique_edges[has_known_endpoints]


Expand All @@ -27,7 +29,10 @@ def get_dangling_edges(edges: DataFrame, nodes: DataFrame) -> DataFrame:
return dangling_edges


def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping_dfs: List[DataFrame] = None, merge_delimiter: str = "|") -> MergedKG:
def merge_kg(edge_dfs: List[DataFrame],
node_dfs: List[DataFrame],
mapping_dfs: List[DataFrame] = None,
merge_delimiter: str = "|") -> tuple[MergedKG, MergeQC]:
all_nodes = concat_dataframes(node_dfs)
all_nodes = all_nodes.fillna(np.nan).replace([np.nan], [None])
all_edges = concat_dataframes(edge_dfs)
Expand All @@ -37,10 +42,12 @@ def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping_dfs:
mapping_df = concat_dataframes(mapping_dfs)
all_edges = apply_mappings(all_edges, mapping_df)

duplicate_nodes = get_duplicate_rows(df=all_nodes)
duplicate_nodes = get_duplicates_by_id(df=all_nodes)
duplicate_edges = get_duplicates_by_id(df=all_edges)
dangling_edges = get_dangling_edges(edges=all_edges, nodes=all_nodes)

nodes = clean_nodes(nodes=all_nodes, merge_delimiter=merge_delimiter)
edges = clean_edges(edges=all_edges, nodes=nodes)

return MergedKG(nodes=nodes, edges=edges, duplicate_nodes=duplicate_nodes, dangling_edges=dangling_edges)
return MergedKG(nodes=nodes, edges=edges), \
MergeQC(duplicate_nodes=duplicate_nodes, duplicate_edges=duplicate_edges, dangling_edges=dangling_edges)
7 changes: 5 additions & 2 deletions cat_merge/model/merged_kg.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@ class MergedKG:
def __init__(self,
nodes: DataFrame,
edges: DataFrame,
duplicate_nodes: DataFrame,
dangling_edges: DataFrame
):
self.nodes = nodes
self.edges = edges


class MergeQC:
    """Container for the quality-control frames produced alongside a merge."""

    def __init__(self, duplicate_nodes: DataFrame, duplicate_edges: DataFrame, dangling_edges: DataFrame):
        """Store the three QC frames unchanged.

        :param duplicate_nodes: node rows flagged as duplicates
        :param duplicate_edges: edge rows flagged as duplicates
        :param dangling_edges: edge rows flagged as dangling
        """
        # Node rows whose id occurs more than once in the combined input.
        self.duplicate_nodes = duplicate_nodes
        # Edge rows whose id occurs more than once in the combined input.
        self.duplicate_edges = duplicate_edges
        # Edges reported as dangling by the merge step.
        self.dangling_edges = dangling_edges
10 changes: 6 additions & 4 deletions cat_merge/qc_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
# from grape import Graph # type: ignore

from cat_merge.model.merged_kg import MergedKG
from cat_merge.model.merged_kg import MergedKG, MergeQC
from typing import Dict, List, Union


Expand Down Expand Up @@ -204,10 +204,11 @@ def get_difference(a: Union[List, pd.Series], b: Union[List, pd.Series]) -> Unio
return s if type(a) is list else pd.Series(s, dtype=a.dtype, name=a.name)


def create_qc_report(kg: MergedKG, data_type: type = dict, group_by: str = "provided_by") -> Dict:
def create_qc_report(kg: MergedKG, qc: MergeQC, data_type: type = dict, group_by: str = "provided_by") -> Dict:
"""
interface for generating qc report from merged kg
:param kg: a MergeKG with data to create QC report
:param qc: a MergeQC with qc data to create QC report
:param data_type: str indicating mode for qc report generation
:param group_by: str indicating which field for qc report grouping
:return: a dictionary representing the QC report
Expand All @@ -216,9 +217,10 @@ def create_qc_report(kg: MergedKG, data_type: type = dict, group_by: str = "prov
nodes = cols_fill_na(kg.nodes, {'in_taxon': 'missing taxon', 'category': 'missing category'})
ingest_collection = {
'nodes': create_nodes_report(nodes, data_type, group_by),
'duplicate_nodes': create_nodes_report(kg.duplicate_nodes, data_type, group_by),
'duplicate_nodes': create_nodes_report(qc.duplicate_nodes, data_type, group_by),
'edges': create_edges_report(kg.edges, nodes, data_type, group_by),
'dangling_edges': create_edges_report(kg.dangling_edges, nodes, data_type, group_by)
'dangling_edges': create_edges_report(qc.dangling_edges, nodes, data_type, group_by),
'duplicate_edges': create_edges_report(qc.duplicate_edges, nodes, data_type, group_by)
}

return ingest_collection
10 changes: 8 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
import pytest
from cat_merge.model.merged_kg import MergedKG
from cat_merge.model.merged_kg import MergedKG, MergeQC
from cat_merge.file_utils import read_kg
from tests.fixtures.edges import kg_edges_1, kg_report_edges_1
from tests.fixtures.nodes import kg_nodes_1, kg_report_nodes_1


@pytest.fixture
def kg_1(kg_report_edges_1, kg_report_nodes_1) -> MergedKG:
kg = MergedKG(nodes=kg_report_nodes_1, edges=kg_report_edges_1, duplicate_nodes=[], dangling_edges=[])
kg = MergedKG(nodes=kg_report_nodes_1, edges=kg_report_edges_1)
return kg


@pytest.fixture
def empty_qc() -> MergeQC:
    """Provide a MergeQC whose duplicate and dangling collections are all empty."""
    # The kg_1 fixture is not requested here: nothing in this fixture reads
    # it, and tests that need it request it themselves.
    return MergeQC(duplicate_nodes=[], dangling_edges=[], duplicate_edges=[])
28 changes: 14 additions & 14 deletions tests/integration/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,40 +49,40 @@ def nodes_and_edges() -> Tuple[List[DataFrame], List[DataFrame]]:

g2p_edges = u"""\
id subject object
uuid:1 Gene:1 Pheno:1
uuid:2 Gene:2 Pheno:2
uuid:3 Gene:5 Pheno:5
uuid:4 Gene:5 Pheno:1
uuid:5 Gene:1 Pheno:1
uuid:6 Gene:2 Pheno:2
uuid:7 Gene:5 Pheno:5
uuid:8 Gene:5 Pheno:1
"""
edges.append(string_df(g2p_edges))

d2p_edges = u"""\
id subject object
uuid:1 Disease:1 Pheno:2
uuid:2 Disease:2 Pheno:4
uuid:3 Disease:1 Pheno:5
uuid:4 Disease:5 Pheno:1
uuid:9 Disease:1 Pheno:2
uuid:10 Disease:2 Pheno:4
uuid:11 Disease:1 Pheno:5
uuid:12 Disease:5 Pheno:1
"""
edges.append(string_df(d2p_edges))

return nodes, edges


def test_merge_kg_node_count(nodes_and_edges):
kg = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
kg, qc = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
assert(len(kg.nodes) == 12)


def test_merge_kg_edge_count(nodes_and_edges):
kg = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
kg, qc = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
assert(len(kg.edges) == 8)


def test_merge_kg_dangling_edge_count(nodes_and_edges):
kg = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
assert(len(kg.dangling_edges) == 4)
kg, qc = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
assert(len(qc.dangling_edges) == 4)


def test_merge_kg_duplicate_node_count(nodes_and_edges):
kg = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
assert(len(kg.duplicate_nodes) == 2)
kg, qc = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
assert(len(qc.duplicate_nodes) == 2)
Original file line number Diff line number Diff line change
Expand Up @@ -277,3 +277,4 @@ edges:
taxon:
- NCBITaxon:9606
dangling_edges: {}
duplicate_edges: {}
Original file line number Diff line number Diff line change
Expand Up @@ -249,3 +249,4 @@ edges:
taxon:
- NCBITaxon:9606
dangling_edges: []
duplicate_edges: []
18 changes: 9 additions & 9 deletions tests/unit/qc_utils/test_create_qc_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,28 @@ def qc_report_expected_dict() -> Dict:
return report_values


def test_create_qc_report_defaults(kg_1, qc_report_expected_dict):
test_report = create_qc_report(kg_1)
def test_create_qc_report_defaults(kg_1, empty_qc, qc_report_expected_dict):
test_report = create_qc_report(kg_1, empty_qc)

assert type(test_report) is dict
assert len(test_report) == 4
assert len(test_report) == 5
check_report_data(test_report.keys(), qc_report_expected_dict.keys())
check_report_data(test_report.values(), qc_report_expected_dict.values())


def test_create_qc_report_list(kg_1, qc_report_expected_list):
test_report = create_qc_report(kg_1, data_type=list)
def test_create_qc_report_list(kg_1, empty_qc, qc_report_expected_list):
test_report = create_qc_report(kg_1, empty_qc, data_type=list)

assert type(test_report) is dict
assert len(test_report) == 4
assert len(test_report) == 5
check_report_data(test_report.keys(), qc_report_expected_list.keys())
check_report_data(test_report.values(), qc_report_expected_list.values())


def test_create_qc_report_dict(kg_1, qc_report_expected_dict):
test_report = create_qc_report(kg_1, data_type=dict)
def test_create_qc_report_dict(kg_1, empty_qc, qc_report_expected_dict):
test_report = create_qc_report(kg_1, empty_qc, data_type=dict)

assert type(test_report) is dict
assert len(test_report) == 4
assert len(test_report) == 5
check_report_data(test_report.keys(), qc_report_expected_dict.keys())
check_report_data(test_report.values(), qc_report_expected_dict.values())
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest
from tests.test_utils import string_df
from cat_merge.merge_utils import get_duplicate_rows
from cat_merge.merge_utils import get_duplicates_by_id
from pandas.core.frame import DataFrame


Expand All @@ -10,15 +10,16 @@ def dataframe_with_duplicates() -> DataFrame:
id category
Gene:1 Gene
Gene:2 Gene
Gene:2 Gene
Gene:2 Gene_2
Gene:3 Gene
"""

return string_df(A)


def test_get_duplicate_row(dataframe_with_duplicates):
df = get_duplicate_rows(dataframe_with_duplicates)
def test_get_duplicates_by_id(dataframe_with_duplicates):
df = get_duplicates_by_id(dataframe_with_duplicates)
assert(len(df) == 2)
assert(list(df.id) == ["Gene:2", "Gene:2"])
assert(list(df.category) == ["Gene", "Gene_2"])