From 12f3bfd7427c60b34899fdbb5833b196c5c9ea75 Mon Sep 17 00:00:00 2001
From: Corey Cox <69321580+amc-corey-cox@users.noreply.github.com>
Date: Wed, 19 Apr 2023 15:19:09 -0500
Subject: [PATCH] Drop duplicate edges by id, report in QC

---
 cat_merge/merge.py                            | 11 +++++---
 cat_merge/merge_utils.py                      | 19 +++++++++----
 cat_merge/model/merged_kg.py                  |  7 +++--
 cat_merge/qc_utils.py                         | 10 ++++---
 tests/conftest.py                             | 10 +++++--
 tests/integration/test_merge.py               | 28 +++++++++----------
 .../test_create_qc_report_expected_dict.yaml  |  1 +
 .../test_create_qc_report_expected_list.yaml  |  1 +
 tests/unit/qc_utils/test_create_qc_report.py  | 18 ++++++------
 ...e_rows.py => test_get_duplicates_by_id.py} |  9 +++---
 10 files changed, 69 insertions(+), 45 deletions(-)
 rename tests/unit/{test_get_duplicate_rows.py => test_get_duplicates_by_id.py} (59%)

diff --git a/cat_merge/merge.py b/cat_merge/merge.py
index 1de07be..29c46c2 100644
--- a/cat_merge/merge.py
+++ b/cat_merge/merge.py
@@ -1,14 +1,17 @@
+import os
 import tarfile
+from typing import List
 import yaml
 import logging
 
-from cat_merge.file_utils import *
-from cat_merge.merge_utils import *
+from cat_merge.file_utils import read_dfs, read_tar_dfs, get_files, write
+from cat_merge.merge_utils import merge_kg
 from cat_merge.qc_utils import create_qc_report
 
 
 log = logging.getLogger(__name__)
 
+
 def merge(
     name: str = "merged-kg",
     source: str = None,  # Optional directory or tar archive containing node and edge files
@@ -57,7 +60,7 @@ def merge(
         mapping_dfs = read_dfs(mappings, add_source_col=None)
 
     print("Merging...")
-    kg = merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping_dfs=mapping_dfs, merge_delimiter=merge_delimiter)
+    kg, qc = merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping_dfs=mapping_dfs, merge_delimiter=merge_delimiter)
     write(
         name=name,
         kg=kg,
@@ -66,7 +69,7 @@ def merge(
 
     if qc_report:
         print("Generating QC report")
-        qc_report = create_qc_report(kg)
+        qc_report = create_qc_report(kg, qc)
         with open(f"{output_dir}/qc_report.yaml", "w") as report_file:
             yaml.dump(qc_report, report_file)
 
diff --git a/cat_merge/merge_utils.py b/cat_merge/merge_utils.py
index 57576f1..7a069b6 100644
--- a/cat_merge/merge_utils.py
+++ b/cat_merge/merge_utils.py
@@ -1,15 +1,16 @@
 import pandas as pd
 from pandas.core.frame import DataFrame
-from typing import List
-from cat_merge.model.merged_kg import MergedKG
+from typing import List, Tuple
+from cat_merge.model.merged_kg import MergedKG, MergeQC
 from cat_merge.mapping_utils import apply_mappings
 import numpy as np
 
+
 def concat_dataframes(dataframes: List[DataFrame]) -> DataFrame:
     return pd.concat(dataframes, axis=0)
 
 
-def get_duplicate_rows(df: DataFrame) -> DataFrame:
+def get_duplicates_by_id(df: DataFrame) -> DataFrame:
     return df[df.id.duplicated(keep=False)]
 
 
@@ -19,6 +20,7 @@ def clean_nodes(nodes: DataFrame, merge_delimiter: str = " ") -> DataFrame:
 
 
 def clean_edges(edges: DataFrame, nodes: DataFrame) -> DataFrame:
+    edges = edges.drop_duplicates(subset=['id'], keep=False)
     return edges[edges.subject.isin(nodes.id) & edges.object.isin(nodes.id)]
 
 
@@ -27,7 +29,10 @@ def get_dangling_edges(edges: DataFrame, nodes: DataFrame) -> DataFrame:
     return dangling_edges
 
 
-def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping_dfs: List[DataFrame] = None, merge_delimiter: str = "|") -> MergedKG:
+def merge_kg(edge_dfs: List[DataFrame],
+             node_dfs: List[DataFrame],
+             mapping_dfs: List[DataFrame] = None,
+             merge_delimiter: str = "|") -> Tuple[MergedKG, MergeQC]:
     all_nodes = concat_dataframes(node_dfs)
     all_nodes = all_nodes.fillna(np.nan).replace([np.nan], [None])
     all_edges = concat_dataframes(edge_dfs)
@@ -37,10 +42,12 @@ def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping_dfs:
         mapping_df = concat_dataframes(mapping_dfs)
         all_edges = apply_mappings(all_edges, mapping_df)
 
-    duplicate_nodes = get_duplicate_rows(df=all_nodes)
+    duplicate_nodes = get_duplicates_by_id(df=all_nodes)
+    duplicate_edges = get_duplicates_by_id(df=all_edges)
     dangling_edges = get_dangling_edges(edges=all_edges, nodes=all_nodes)
 
     nodes = clean_nodes(nodes=all_nodes, merge_delimiter=merge_delimiter)
     edges = clean_edges(edges=all_edges, nodes=nodes)
 
-    return MergedKG(nodes=nodes, edges=edges, duplicate_nodes=duplicate_nodes, dangling_edges=dangling_edges)
+    return MergedKG(nodes=nodes, edges=edges), \
+        MergeQC(duplicate_nodes=duplicate_nodes, duplicate_edges=duplicate_edges, dangling_edges=dangling_edges)
diff --git a/cat_merge/model/merged_kg.py b/cat_merge/model/merged_kg.py
index d8364a3..4ac2413 100644
--- a/cat_merge/model/merged_kg.py
+++ b/cat_merge/model/merged_kg.py
@@ -5,10 +5,13 @@ class MergedKG:
     def __init__(self,
                  nodes: DataFrame,
                  edges: DataFrame,
-                 duplicate_nodes: DataFrame,
-                 dangling_edges: DataFrame
                  ):
         self.nodes = nodes
         self.edges = edges
+
+
+class MergeQC:
+    def __init__(self, duplicate_nodes: DataFrame, duplicate_edges: DataFrame, dangling_edges: DataFrame):
         self.duplicate_nodes = duplicate_nodes
+        self.duplicate_edges = duplicate_edges
         self.dangling_edges = dangling_edges
diff --git a/cat_merge/qc_utils.py b/cat_merge/qc_utils.py
index effbccb..6a9380f 100644
--- a/cat_merge/qc_utils.py
+++ b/cat_merge/qc_utils.py
@@ -1,7 +1,7 @@
 import pandas as pd
 
 # from grape import Graph  # type: ignore
-from cat_merge.model.merged_kg import MergedKG
+from cat_merge.model.merged_kg import MergedKG, MergeQC
 from typing import Dict, List, Union
 
 
@@ -204,10 +204,11 @@ def get_difference(a: Union[List, pd.Series], b: Union[List, pd.Series]) -> Unio
     return s if type(a) is list else pd.Series(s, dtype=a.dtype, name=a.name)
 
 
-def create_qc_report(kg: MergedKG, data_type: type = dict, group_by: str = "provided_by") -> Dict:
+def create_qc_report(kg: MergedKG, qc: MergeQC, data_type: type = dict, group_by: str = "provided_by") -> Dict:
     """
     interface for generating qc report from merged kg
     :param kg: a MergeKG with data to create QC report
+    :param qc: a MergeQC with qc data to create QC report
     :param data_type: str indicating mode for qc report generation
     :param group_by: str indicating which field for qc report grouping
     :return: a dictionary representing the QC report
@@ -216,9 +217,10 @@ def create_qc_report(kg: MergedKG, data_type: type = dict, group_by: str = "prov
     nodes = cols_fill_na(kg.nodes, {'in_taxon': 'missing taxon', 'category': 'missing category'})
     ingest_collection = {
         'nodes': create_nodes_report(nodes, data_type, group_by),
-        'duplicate_nodes': create_nodes_report(kg.duplicate_nodes, data_type, group_by),
+        'duplicate_nodes': create_nodes_report(qc.duplicate_nodes, data_type, group_by),
         'edges': create_edges_report(kg.edges, nodes, data_type, group_by),
-        'dangling_edges': create_edges_report(kg.dangling_edges, nodes, data_type, group_by)
+        'dangling_edges': create_edges_report(qc.dangling_edges, nodes, data_type, group_by),
+        'duplicate_edges': create_edges_report(qc.duplicate_edges, nodes, data_type, group_by)
     }
 
     return ingest_collection
diff --git a/tests/conftest.py b/tests/conftest.py
index e8b44e4..5789da7 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,5 @@
 import pytest
-from cat_merge.model.merged_kg import MergedKG
+from cat_merge.model.merged_kg import MergedKG, MergeQC
 from cat_merge.file_utils import read_kg
 from tests.fixtures.edges import kg_edges_1, kg_report_edges_1
 from tests.fixtures.nodes import kg_nodes_1, kg_report_nodes_1
@@ -7,5 +7,11 @@
 
 @pytest.fixture
 def kg_1(kg_report_edges_1, kg_report_nodes_1) -> MergedKG:
-    kg = MergedKG(nodes=kg_report_nodes_1, edges=kg_report_edges_1, duplicate_nodes=[], dangling_edges=[])
+    kg = MergedKG(nodes=kg_report_nodes_1, edges=kg_report_edges_1)
     return kg
+
+
+@pytest.fixture
+def empty_qc(kg_1) -> MergeQC:
+    qc = MergeQC(duplicate_nodes=[], dangling_edges=[], duplicate_edges=[])
+    return qc
diff --git a/tests/integration/test_merge.py b/tests/integration/test_merge.py
index 224107b..6e55471 100644
--- a/tests/integration/test_merge.py
+++ b/tests/integration/test_merge.py
@@ -49,19 +49,19 @@ def nodes_and_edges() -> Tuple[List[DataFrame], List[DataFrame]]:
 
     g2p_edges = u"""\
     id      subject object
-    uuid:1  Gene:1  Pheno:1
-    uuid:2  Gene:2  Pheno:2
-    uuid:3  Gene:5  Pheno:5
-    uuid:4  Gene:5  Pheno:1
+    uuid:5  Gene:1  Pheno:1
+    uuid:6  Gene:2  Pheno:2
+    uuid:7  Gene:5  Pheno:5
+    uuid:8  Gene:5  Pheno:1
     """
     edges.append(string_df(g2p_edges))
 
     d2p_edges = u"""\
     id      subject   object
-    uuid:1  Disease:1 Pheno:2
-    uuid:2  Disease:2 Pheno:4
-    uuid:3  Disease:1 Pheno:5
-    uuid:4  Disease:5 Pheno:1
+    uuid:9  Disease:1 Pheno:2
+    uuid:10 Disease:2 Pheno:4
+    uuid:11 Disease:1 Pheno:5
+    uuid:12 Disease:5 Pheno:1
     """
     edges.append(string_df(d2p_edges))
 
@@ -69,20 +69,20 @@ def nodes_and_edges() -> Tuple[List[DataFrame], List[DataFrame]]:
 
 
 def test_merge_kg_node_count(nodes_and_edges):
-    kg = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
+    kg, qc = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
     assert(len(kg.nodes) == 12)
 
 
 def test_merge_kg_edge_count(nodes_and_edges):
-    kg = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
+    kg, qc = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
     assert(len(kg.edges) == 8)
 
 
 def test_merge_kg_dangling_edge_count(nodes_and_edges):
-    kg = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
-    assert(len(kg.dangling_edges) == 4)
+    kg, qc = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
+    assert(len(qc.dangling_edges) == 4)
 
 
 def test_merge_kg_duplicate_node_count(nodes_and_edges):
-    kg = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
-    assert(len(kg.duplicate_nodes) == 2)
+    kg, qc = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
+    assert(len(qc.duplicate_nodes) == 2)
diff --git a/tests/test_data/expected/test_create_qc_report_expected_dict.yaml b/tests/test_data/expected/test_create_qc_report_expected_dict.yaml
index ea4315c..5dffa0c 100644
--- a/tests/test_data/expected/test_create_qc_report_expected_dict.yaml
+++ b/tests/test_data/expected/test_create_qc_report_expected_dict.yaml
@@ -277,3 +277,4 @@ edges:
     taxon:
     - NCBITaxon:9606
 dangling_edges: {}
+duplicate_edges: {}
diff --git a/tests/test_data/expected/test_create_qc_report_expected_list.yaml b/tests/test_data/expected/test_create_qc_report_expected_list.yaml
index 1e63246..5b44e21 100644
--- a/tests/test_data/expected/test_create_qc_report_expected_list.yaml
+++ b/tests/test_data/expected/test_create_qc_report_expected_list.yaml
@@ -249,3 +249,4 @@ edges:
     taxon:
     - NCBITaxon:9606
 dangling_edges: []
+duplicate_edges: []
diff --git a/tests/unit/qc_utils/test_create_qc_report.py b/tests/unit/qc_utils/test_create_qc_report.py
index e4ff1e9..0525219 100644
--- a/tests/unit/qc_utils/test_create_qc_report.py
+++ b/tests/unit/qc_utils/test_create_qc_report.py
@@ -21,28 +21,28 @@ def qc_report_expected_dict() -> Dict:
     return report_values
 
 
-def test_create_qc_report_defaults(kg_1, qc_report_expected_dict):
-    test_report = create_qc_report(kg_1)
+def test_create_qc_report_defaults(kg_1, empty_qc, qc_report_expected_dict):
+    test_report = create_qc_report(kg_1, empty_qc)
     assert type(test_report) is dict
-    assert len(test_report) == 4
+    assert len(test_report) == 5
 
     check_report_data(test_report.keys(), qc_report_expected_dict.keys())
     check_report_data(test_report.values(), qc_report_expected_dict.values())
 
 
-def test_create_qc_report_list(kg_1, qc_report_expected_list):
-    test_report = create_qc_report(kg_1, data_type=list)
+def test_create_qc_report_list(kg_1, empty_qc, qc_report_expected_list):
+    test_report = create_qc_report(kg_1, empty_qc, data_type=list)
     assert type(test_report) is dict
-    assert len(test_report) == 4
+    assert len(test_report) == 5
 
     check_report_data(test_report.keys(), qc_report_expected_list.keys())
     check_report_data(test_report.values(), qc_report_expected_list.values())
 
 
-def test_create_qc_report_dict(kg_1, qc_report_expected_dict):
-    test_report = create_qc_report(kg_1, data_type=dict)
+def test_create_qc_report_dict(kg_1, empty_qc, qc_report_expected_dict):
+    test_report = create_qc_report(kg_1, empty_qc, data_type=dict)
     assert type(test_report) is dict
-    assert len(test_report) == 4
+    assert len(test_report) == 5
 
     check_report_data(test_report.keys(), qc_report_expected_dict.keys())
     check_report_data(test_report.values(), qc_report_expected_dict.values())
diff --git a/tests/unit/test_get_duplicate_rows.py b/tests/unit/test_get_duplicates_by_id.py
similarity index 59%
rename from tests/unit/test_get_duplicate_rows.py
rename to tests/unit/test_get_duplicates_by_id.py
index 7054684..eff2c75 100644
--- a/tests/unit/test_get_duplicate_rows.py
+++ b/tests/unit/test_get_duplicates_by_id.py
@@ -1,6 +1,6 @@
 import pytest
 from tests.test_utils import string_df
-from cat_merge.merge_utils import get_duplicate_rows
+from cat_merge.merge_utils import get_duplicates_by_id
 from pandas.core.frame import DataFrame
 
 
@@ -10,15 +10,16 @@ def dataframe_with_duplicates() -> DataFrame:
     id      category
     Gene:1  Gene
    Gene:2  Gene
-    Gene:2  Gene
+    Gene:2  Gene_2
     Gene:3  Gene
     """
     return string_df(A)
 
 
-def test_get_duplicate_row(dataframe_with_duplicates):
-    df = get_duplicate_rows(dataframe_with_duplicates)
+def test_get_duplicates_by_id(dataframe_with_duplicates):
+    df = get_duplicates_by_id(dataframe_with_duplicates)
 
     assert(len(df) == 2)
     assert(list(df.id) == ["Gene:2", "Gene:2"])
+    assert(list(df.category) == ["Gene", "Gene_2"])
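
Reviewer note, not part of the patch: the dedup logic above leans on two pandas behaviours, Series.duplicated(keep=False), which flags every row whose id occurs more than once, and DataFrame.drop_duplicates(subset=['id'], keep=False), which drops all of those rows rather than keeping a first occurrence. A standalone sketch with a made-up frame:

    import pandas as pd

    edges = pd.DataFrame({
        "id":      ["uuid:1", "uuid:1", "uuid:2"],
        "subject": ["Gene:1", "Gene:1", "Gene:2"],
        "object":  ["Pheno:1", "Pheno:1", "Pheno:2"],
    })

    # get_duplicates_by_id(): keep=False marks every row that shares a repeated id
    duplicate_edges = edges[edges.id.duplicated(keep=False)]
    print(list(duplicate_edges.id))   # ['uuid:1', 'uuid:1'] -> surfaced in the QC report

    # clean_edges(): keep=False drops all rows with a repeated id, not just the extras
    kept = edges.drop_duplicates(subset=["id"], keep=False)
    print(list(kept.id))              # ['uuid:2'] -> only unique edge ids reach the merged KG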