Skip to content

Commit

Permalink
Merge pull request #111 from microbiomedata/schema-pattern-linting
Browse files Browse the repository at this point in the history
schema pattern linting
  • Loading branch information
turbomam committed Mar 29, 2024
2 parents c49823b + 3dd5e90 commit 5026c97
Show file tree
Hide file tree
Showing 5 changed files with 437 additions and 0 deletions.
115 changes: 115 additions & 0 deletions assets/check_examples_class_coverage.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
Types asserted in any src/data/valid YAML file:

{'nmdc:Biosample': 18,
'nmdc:ChemicalConversionProcess': 2,
'nmdc:ChemicalEntity': 2,
'nmdc:ChromatographicSeparationProcess': 3,
'nmdc:CollectingBiosamplesFromSite': 1,
'nmdc:ControlledIdentifiedTermValue': 18,
'nmdc:ControlledTermValue': 1,
'nmdc:DataObject': 10,
'nmdc:DissolvingProcess': 1,
'nmdc:Doi': 4,
'nmdc:Extraction': 9,
'nmdc:FieldResearchSite': 1,
'nmdc:FiltrationProcess': 1,
'nmdc:FunctionalAnnotation': 3,
'nmdc:FunctionalAnnotationAggMember': 2,
'nmdc:GeneProduct': 1,
'nmdc:GeolocationValue': 3,
'nmdc:ImageValue': 2,
'nmdc:Instrument': 2,
'nmdc:LibraryPreparation': 4,
'nmdc:MagBin': 1,
'nmdc:MagsAnalysis': 1,
'nmdc:MassSpectrometry': 1,
'nmdc:MetabolomicsAnalysis': 2,
'nmdc:MetagenomeAssembly': 2,
'nmdc:MetagenomeSequencing': 2,
'nmdc:MixingProcess': 1,
'nmdc:NomAnalysis': 1,
'nmdc:NucleotideSequencing': 8,
'nmdc:OntologyClass': 18,
'nmdc:PersonValue': 7,
'nmdc:PlaceholderClass': 1,
'nmdc:Pooling': 7,
'nmdc:ProcessedSample': 5,
'nmdc:Protocol': 3,
'nmdc:ProtocolExecution': 2,
'nmdc:QuantityValue': 19,
'nmdc:ReadQcAnalysis': 4,
'nmdc:Solution': 8,
'nmdc:SolutionComponent': 9,
'nmdc:Study': 10,
'nmdc:SubSamplingProcess': 1,
'nmdc:TextValue': 3,
'nmdc:TimestampValue': 1,
'nmdc:WorkflowChain': 3,
'prov:Association': 2}


Classes that are not instantiated in any src/data/valid YAML file:

{'abstract': {'DataGeneration': {'declared_uri': 'nmdc:DataGeneration',
'native_uri': 'nmdc:DataGeneration'},
'FluidHandling': {'declared_uri': 'nmdc:FluidHandling',
'native_uri': 'nmdc:FluidHandling'},
'FunctionalAnnotationTerm': {'declared_uri': 'nmdc:FunctionalAnnotationTerm',
'native_uri': 'nmdc:FunctionalAnnotationTerm'},
'MaterialEntity': {'declared_uri': 'nmdc:MaterialEntity',
'native_uri': 'nmdc:MaterialEntity'},
'NamedThing': {'declared_uri': 'nmdc:NamedThing',
'native_uri': 'nmdc:NamedThing'},
'PlannedProcess': {'declared_uri': 'OBI:0000011',
'native_uri': 'nmdc:PlannedProcess'},
'Site': {'declared_uri': 'nmdc:Site', 'native_uri': 'nmdc:Site'}},
'concrete': {'AttributeValue': {'declared_uri': 'nmdc:AttributeValue',
'native_uri': 'nmdc:AttributeValue'},
'BooleanValue': {'declared_uri': 'nmdc:BooleanValue',
'native_uri': 'nmdc:BooleanValue'},
'Database': {'declared_uri': 'nmdc:Database',
'native_uri': 'nmdc:Database'},
'DirectInfusionProcess': {'declared_uri': 'nmdc:DirectInfusionProcess',
'native_uri': 'nmdc:DirectInfusionProcess'},
'EnvironmentalMaterialTerm': {'declared_uri': 'nmdc:EnvironmentalMaterialTerm',
'native_uri': 'nmdc:EnvironmentalMaterialTerm'},
'FailureCategorization': {'declared_uri': 'nmdc:FailureCategorization',
'native_uri': 'nmdc:FailureCategorization'},
'GenomeFeature': {'declared_uri': 'nmdc:GenomeFeature',
'native_uri': 'nmdc:GenomeFeature'},
'IntegerValue': {'declared_uri': 'nmdc:IntegerValue',
'native_uri': 'nmdc:IntegerValue'},
'MaterialProcessing': {'declared_uri': 'nmdc:MaterialProcessing',
'native_uri': 'nmdc:MaterialProcessing'},
'MetaboliteQuantification': {'declared_uri': 'nmdc:MetaboliteQuantification',
'native_uri': 'nmdc:MetaboliteQuantification'},
'MetagenomeAnnotation': {'declared_uri': 'nmdc:MetagenomeAnnotation',
'native_uri': 'nmdc:MetagenomeAnnotation'},
'MetaproteomicsAnalysis': {'declared_uri': 'nmdc:MetaproteomicsAnalysis',
'native_uri': 'nmdc:MetaproteomicsAnalysis'},
'MetatranscriptomeAnalysis': {'declared_uri': 'nmdc:MetatranscriptomeAnalysis',
'native_uri': 'nmdc:MetatranscriptomeAnalysis'},
'MetatranscriptomeAnnotation': {'declared_uri': 'nmdc:MetatranscriptomeAnnotation',
'native_uri': 'nmdc:MetatranscriptomeAnnotation'},
'MetatranscriptomeAssembly': {'declared_uri': 'nmdc:MetatranscriptomeAssembly',
'native_uri': 'nmdc:MetatranscriptomeAssembly'},
'OrthologyGroup': {'declared_uri': 'nmdc:OrthologyGroup',
'native_uri': 'nmdc:OrthologyGroup'},
'Pathway': {'declared_uri': 'nmdc:Pathway',
'native_uri': 'nmdc:Pathway'},
'PeptideQuantification': {'declared_uri': 'nmdc:PeptideQuantification',
'native_uri': 'nmdc:PeptideQuantification'},
'ProteinQuantification': {'declared_uri': 'nmdc:ProteinQuantification',
'native_uri': 'nmdc:ProteinQuantification'},
'Reaction': {'declared_uri': 'nmdc:Reaction',
'native_uri': 'nmdc:Reaction'},
'ReactionParticipant': {'declared_uri': 'nmdc:ReactionParticipant',
'native_uri': 'nmdc:ReactionParticipant'},
'ReadBasedTaxonomyAnalysis': {'declared_uri': 'nmdc:ReadBasedTaxonomyAnalysis',
'native_uri': 'nmdc:ReadBasedTaxonomyAnalysis'},
'Substance': {'declared_uri': 'nmdc:Substance',
'native_uri': 'nmdc:Substance'},
'UrlValue': {'declared_uri': 'nmdc:UrlValue',
'native_uri': 'nmdc:UrlValue'},
'WorkflowExecution': {'declared_uri': 'nmdc:WorkflowExecution',
'native_uri': 'nmdc:WorkflowExecution'}}}
150 changes: 150 additions & 0 deletions assets/schema_pattern_linting.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
Report of slots that aren't associated with any classes:

No classes for parent slot analysis_identifiers
No classes for parent slot assembly_identifiers
No classes for slot biogas_retention_time
No classes for slot biogas_temperature
No classes for parent slot biosample_identifiers
No classes for slot completion_date
No classes for slot date_created
No classes for slot ecosystem_path_id
No classes for parent slot emsl_identifiers
No classes for slot emsl_store_temp
No classes for slot etl_software_version
No classes for parent slot gff_coordinate
No classes for parent slot gnps_identifiers
No classes for parent slot gold_identifiers
No classes for parent slot gold_path_field
No classes for parent slot has_participants
No classes for parent slot igsn_identifiers
No classes for slot input_volume
No classes for slot insdc_analysis_identifiers
No classes for parent slot insdc_identifiers
No classes for slot insdc_secondary_sample_identifiers
No classes for slot insdc_sra_ena_study_identifiers
No classes for parent slot jgi_portal_identifiers
No classes for slot material_component_separation
No classes for parent slot metagenome_assembly_parameter
No classes for slot mgnify_analysis_identifiers
No classes for parent slot mgnify_identifiers
No classes for slot modifier_substance
No classes for parent slot neon_identifiers
No classes for slot nucl_acid_ext, MIXS:0000037
No classes for parent slot object_set
No classes for parent slot omics_processing_identifiers
No classes for slot omics_type
No classes for slot pool_dna_extracts, MIXS:0000325
No classes for parent slot read_qc_analysis_statistic
No classes for slot samp_vol_we_dna_ext, MIXS:0000111
No classes for slot sample_collection_day
No classes for slot sample_collection_hour
No classes for slot sample_collection_minute
No classes for slot sample_collection_month
No classes for slot sample_collection_year
No classes for slot seq_meth, MIXS:0000050
No classes for slot seq_quality_check, MIXS:0000051
No classes for slot soil_annual_season_temp
No classes for parent slot study_identifiers
No classes for slot value


Report of enums that aren't associated with any slots:

No slots for ProteolyticEnzymeEnum
No slots for freq_clean_enum
No slots for organism_count_enum
No slots for plant_growth_med_enum
No slots for samp_md_enum


Report of all types. Manual review recommended:

['bytes',
'decimal degree',
'language code',
'unit',
'string',
'integer',
'boolean',
'float',
'double',
'decimal',
'time',
'date',
'datetime',
'date_or_datetime',
'uriorcurie',
'curie',
'uri',
'ncname',
'objectidentifier',
'nodeidentifier',
'jsonpointer',
'jsonpath',
'sparqlpath',
'external_identifier']


Report of all elements whose names contain whitespace:

name: decimal degree
description: A decimal degree expresses latitude or longitude as decimal fractions.
from_schema: https://w3id.org/nmdc/core
see_also:
- https://en.wikipedia.org/wiki/Decimal_degrees
base: float
uri: xsd:decimal

name: language code
description: A language code conforming to ISO_639-1
from_schema: https://w3id.org/nmdc/core
see_also:
- https://en.wikipedia.org/wiki/ISO_639-1
base: str
uri: xsd:language

name: nucleic acid sequence source
from_schema: https://w3id.org/nmdc/core

name: workflow subset
description: Subset consisting of just the workflow execution activities
from_schema: https://w3id.org/nmdc/workflow_execution_activity

name: sample subset
description: Subset consisting of entities linked to the processing of samples. Currently,
this subset consists of study, omics process, and biosample.
from_schema: https://w3id.org/nmdc/nmdc

name: data object subset
description: Subset consisting of the data objects that are either inputs or outputs of
processes or workflows.
from_schema: https://w3id.org/nmdc/nmdc



Report of subsets usage:

{'data object subset': ['DataObject'],
'data_portal_subset': ['associated_dois',
'doi_value',
'doi_provider',
'doi_category'],
'environment': [],
'investigation': [],
'nucleic acid sequence source': [],
'proteases': [],
'sample subset': ['Biosample', 'Study', 'DataGeneration'],
'sequencing': [],
'workflow subset': ['WorkflowExecution',
'MetagenomeAssembly',
'MetatranscriptomeAssembly',
'MetagenomeAnnotation',
'MetatranscriptomeAnnotation',
'MetatranscriptomeAnalysis',
'MagsAnalysis',
'MetagenomeSequencing',
'ReadQcAnalysis',
'ReadBasedTaxonomyAnalysis',
'MetabolomicsAnalysis',
'MetaproteomicsAnalysis',
'NomAnalysis']}
6 changes: 6 additions & 0 deletions project.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -544,3 +544,9 @@ local/mongo_as_nmdc_database_cuire_repaired_stamped.ttl: local/mongo_as_nmdc_dat
$(RUN) python src/scripts/date_created_blank_node.py > local/date_created_blank_node.ttl
cat $^ local/date_created_blank_node.ttl > $@
rm local/date_created_blank_node.ttl

# Regenerate the example class-coverage report whenever the script, the schema
# file it loads, or any of the example data files it scans changes. Without
# prerequisites, make treats an existing report as permanently up to date.
# NOTE(review): schema files imported by nmdc.yaml are not tracked here; add
# them as prerequisites if the report should follow those too.
assets/check_examples_class_coverage.txt: src/scripts/check_examples_class_coverage.py \
		src/schema/nmdc.yaml \
		$(wildcard src/data/valid/*.yaml)
	$(RUN) python src/scripts/check_examples_class_coverage.py > $@

# Regenerate the schema pattern-linting report whenever its script changes.
# Without prerequisites, make treats an existing report as permanently up to date.
# NOTE(review): the files the script reads are not visible from this rule;
# add them as prerequisites as well once confirmed.
assets/schema_pattern_linting.txt: src/scripts/schema_pattern_linting.py
	$(RUN) python src/scripts/schema_pattern_linting.py > $@
92 changes: 92 additions & 0 deletions src/scripts/check_examples_class_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import os
import pprint

import yaml
from linkml_runtime import SchemaView

# Schema file loaded into the SchemaView below (relative path).
schema_file = "src/schema/nmdc.yaml"

# Directory whose YAML files are scanned for "type" assertions (relative path).
source_directory = "src/data/valid"

# Built once at import time; the rest of the script queries it for classes and URIs.
schema_view = SchemaView(schema_file)


def find_type_keys(data):
    """
    Recursively searches for all "type" key values within a YAML structure.

    Every occurrence is counted, including repeats of the same value in
    sibling or nested objects. Dicts and lists are traversed; any other
    value (scalars, None) contributes nothing.

    Args:
        data: The YAML data structure (dict or list).
    Returns:
        A dictionary where keys are type values and values are their counts.
    """
    tc = {}

    def _walk(node):
        # Accumulate into the shared tally; merging per-branch dicts with
        # dict.update() would overwrite counts rather than add them.
        if isinstance(node, dict):
            for key, value in node.items():
                if key == "type" and not isinstance(value, (dict, list)):
                    tc[value] = tc.get(value, 0) + 1
                else:
                    # Also covers a container stored under "type", which is
                    # unhashable and so cannot be a key of the tally.
                    _walk(value)
        elif isinstance(node, list):
            for item in node:
                _walk(item)

    _walk(data)
    return tc


def process_all_files(directory):
    """
    Processes all YAML files in a directory, finding all "type" key values and accumulating counts in a dictionary.

    Only entries directly inside the directory whose names end in ".yaml"
    are read. Each is parsed with yaml.safe_load and scanned with
    find_type_keys.

    Args:
        directory: The directory containing YAML files.
    Returns:
        A dictionary where keys are type values from all files and values are their total counts.
    """
    tc = {}
    # Sorted so files are visited in the same order on every platform.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".yaml"):
            continue
        filepath = os.path.join(directory, filename)
        # Explicit encoding so parsing does not depend on the locale default.
        with open(filepath, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        # Add this file's counts to the running totals.
        for key, value in find_type_keys(data).items():
            tc[key] = tc.get(key, 0) + value
    return tc


# Tally every asserted "type" value across the example data files.
type_counts = process_all_files(source_directory)

print("Types asserted in any src/data/valid YAML file:")
print("")
pprint.pprint(type_counts)
print("\n")

schema_classes = schema_view.all_classes()

# Both URI forms reported by the schema view for each class, keyed by class name.
available_classes = {
    class_name: {
        "declared_uri": schema_view.get_uri(class_def, native=False),
        "native_uri": schema_view.get_uri(class_def, native=True),
    }
    for class_name, class_def in schema_classes.items()
}

# Classes for which neither URI appears among the asserted types,
# split by whether the schema marks the class as abstract.
lacks_example = {"abstract": {}, "concrete": {}}
for class_name, uris in available_classes.items():
    if uris["declared_uri"] in type_counts or uris["native_uri"] in type_counts:
        continue
    if schema_view.get_class(class_name).abstract:
        bucket = "abstract"
    else:
        bucket = "concrete"
    lacks_example[bucket][class_name] = uris

print("Classes that are not instantiated in any src/data/valid YAML file:")
print("")
pprint.pprint(lacks_example)
Loading

0 comments on commit 5026c97

Please sign in to comment.