forked from microbiomedata/nmdc-schema
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #111 from microbiomedata/schema-pattern-linting
schema pattern linting
- Loading branch information
Showing
5 changed files
with
437 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
Types asserted in any src/data/valid YAML file: | ||
|
||
{'nmdc:Biosample': 18, | ||
'nmdc:ChemicalConversionProcess': 2, | ||
'nmdc:ChemicalEntity': 2, | ||
'nmdc:ChromatographicSeparationProcess': 3, | ||
'nmdc:CollectingBiosamplesFromSite': 1, | ||
'nmdc:ControlledIdentifiedTermValue': 18, | ||
'nmdc:ControlledTermValue': 1, | ||
'nmdc:DataObject': 10, | ||
'nmdc:DissolvingProcess': 1, | ||
'nmdc:Doi': 4, | ||
'nmdc:Extraction': 9, | ||
'nmdc:FieldResearchSite': 1, | ||
'nmdc:FiltrationProcess': 1, | ||
'nmdc:FunctionalAnnotation': 3, | ||
'nmdc:FunctionalAnnotationAggMember': 2, | ||
'nmdc:GeneProduct': 1, | ||
'nmdc:GeolocationValue': 3, | ||
'nmdc:ImageValue': 2, | ||
'nmdc:Instrument': 2, | ||
'nmdc:LibraryPreparation': 4, | ||
'nmdc:MagBin': 1, | ||
'nmdc:MagsAnalysis': 1, | ||
'nmdc:MassSpectrometry': 1, | ||
'nmdc:MetabolomicsAnalysis': 2, | ||
'nmdc:MetagenomeAssembly': 2, | ||
'nmdc:MetagenomeSequencing': 2, | ||
'nmdc:MixingProcess': 1, | ||
'nmdc:NomAnalysis': 1, | ||
'nmdc:NucleotideSequencing': 8, | ||
'nmdc:OntologyClass': 18, | ||
'nmdc:PersonValue': 7, | ||
'nmdc:PlaceholderClass': 1, | ||
'nmdc:Pooling': 7, | ||
'nmdc:ProcessedSample': 5, | ||
'nmdc:Protocol': 3, | ||
'nmdc:ProtocolExecution': 2, | ||
'nmdc:QuantityValue': 19, | ||
'nmdc:ReadQcAnalysis': 4, | ||
'nmdc:Solution': 8, | ||
'nmdc:SolutionComponent': 9, | ||
'nmdc:Study': 10, | ||
'nmdc:SubSamplingProcess': 1, | ||
'nmdc:TextValue': 3, | ||
'nmdc:TimestampValue': 1, | ||
'nmdc:WorkflowChain': 3, | ||
'prov:Association': 2} | ||
|
||
|
||
Classes that are not instantiated in any src/data/valid YAML file: | ||
|
||
{'abstract': {'DataGeneration': {'declared_uri': 'nmdc:DataGeneration', | ||
'native_uri': 'nmdc:DataGeneration'}, | ||
'FluidHandling': {'declared_uri': 'nmdc:FluidHandling', | ||
'native_uri': 'nmdc:FluidHandling'}, | ||
'FunctionalAnnotationTerm': {'declared_uri': 'nmdc:FunctionalAnnotationTerm', | ||
'native_uri': 'nmdc:FunctionalAnnotationTerm'}, | ||
'MaterialEntity': {'declared_uri': 'nmdc:MaterialEntity', | ||
'native_uri': 'nmdc:MaterialEntity'}, | ||
'NamedThing': {'declared_uri': 'nmdc:NamedThing', | ||
'native_uri': 'nmdc:NamedThing'}, | ||
'PlannedProcess': {'declared_uri': 'OBI:0000011', | ||
'native_uri': 'nmdc:PlannedProcess'}, | ||
'Site': {'declared_uri': 'nmdc:Site', 'native_uri': 'nmdc:Site'}}, | ||
'concrete': {'AttributeValue': {'declared_uri': 'nmdc:AttributeValue', | ||
'native_uri': 'nmdc:AttributeValue'}, | ||
'BooleanValue': {'declared_uri': 'nmdc:BooleanValue', | ||
'native_uri': 'nmdc:BooleanValue'}, | ||
'Database': {'declared_uri': 'nmdc:Database', | ||
'native_uri': 'nmdc:Database'}, | ||
'DirectInfusionProcess': {'declared_uri': 'nmdc:DirectInfusionProcess', | ||
'native_uri': 'nmdc:DirectInfusionProcess'}, | ||
'EnvironmentalMaterialTerm': {'declared_uri': 'nmdc:EnvironmentalMaterialTerm', | ||
'native_uri': 'nmdc:EnvironmentalMaterialTerm'}, | ||
'FailureCategorization': {'declared_uri': 'nmdc:FailureCategorization', | ||
'native_uri': 'nmdc:FailureCategorization'}, | ||
'GenomeFeature': {'declared_uri': 'nmdc:GenomeFeature', | ||
'native_uri': 'nmdc:GenomeFeature'}, | ||
'IntegerValue': {'declared_uri': 'nmdc:IntegerValue', | ||
'native_uri': 'nmdc:IntegerValue'}, | ||
'MaterialProcessing': {'declared_uri': 'nmdc:MaterialProcessing', | ||
'native_uri': 'nmdc:MaterialProcessing'}, | ||
'MetaboliteQuantification': {'declared_uri': 'nmdc:MetaboliteQuantification', | ||
'native_uri': 'nmdc:MetaboliteQuantification'}, | ||
'MetagenomeAnnotation': {'declared_uri': 'nmdc:MetagenomeAnnotation', | ||
'native_uri': 'nmdc:MetagenomeAnnotation'}, | ||
'MetaproteomicsAnalysis': {'declared_uri': 'nmdc:MetaproteomicsAnalysis', | ||
'native_uri': 'nmdc:MetaproteomicsAnalysis'}, | ||
'MetatranscriptomeAnalysis': {'declared_uri': 'nmdc:MetatranscriptomeAnalysis', | ||
'native_uri': 'nmdc:MetatranscriptomeAnalysis'}, | ||
'MetatranscriptomeAnnotation': {'declared_uri': 'nmdc:MetatranscriptomeAnnotation', | ||
'native_uri': 'nmdc:MetatranscriptomeAnnotation'}, | ||
'MetatranscriptomeAssembly': {'declared_uri': 'nmdc:MetatranscriptomeAssembly', | ||
'native_uri': 'nmdc:MetatranscriptomeAssembly'}, | ||
'OrthologyGroup': {'declared_uri': 'nmdc:OrthologyGroup', | ||
'native_uri': 'nmdc:OrthologyGroup'}, | ||
'Pathway': {'declared_uri': 'nmdc:Pathway', | ||
'native_uri': 'nmdc:Pathway'}, | ||
'PeptideQuantification': {'declared_uri': 'nmdc:PeptideQuantification', | ||
'native_uri': 'nmdc:PeptideQuantification'}, | ||
'ProteinQuantification': {'declared_uri': 'nmdc:ProteinQuantification', | ||
'native_uri': 'nmdc:ProteinQuantification'}, | ||
'Reaction': {'declared_uri': 'nmdc:Reaction', | ||
'native_uri': 'nmdc:Reaction'}, | ||
'ReactionParticipant': {'declared_uri': 'nmdc:ReactionParticipant', | ||
'native_uri': 'nmdc:ReactionParticipant'}, | ||
'ReadBasedTaxonomyAnalysis': {'declared_uri': 'nmdc:ReadBasedTaxonomyAnalysis', | ||
'native_uri': 'nmdc:ReadBasedTaxonomyAnalysis'}, | ||
'Substance': {'declared_uri': 'nmdc:Substance', | ||
'native_uri': 'nmdc:Substance'}, | ||
'UrlValue': {'declared_uri': 'nmdc:UrlValue', | ||
'native_uri': 'nmdc:UrlValue'}, | ||
'WorkflowExecution': {'declared_uri': 'nmdc:WorkflowExecution', | ||
'native_uri': 'nmdc:WorkflowExecution'}}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
Report of slots that aren't associated with any classes: | ||
|
||
No classes for parent slot analysis_identifiers | ||
No classes for parent slot assembly_identifiers | ||
No classes for slot biogas_retention_time | ||
No classes for slot biogas_temperature | ||
No classes for parent slot biosample_identifiers | ||
No classes for slot completion_date | ||
No classes for slot date_created | ||
No classes for slot ecosystem_path_id | ||
No classes for parent slot emsl_identifiers | ||
No classes for slot emsl_store_temp | ||
No classes for slot etl_software_version | ||
No classes for parent slot gff_coordinate | ||
No classes for parent slot gnps_identifiers | ||
No classes for parent slot gold_identifiers | ||
No classes for parent slot gold_path_field | ||
No classes for parent slot has_participants | ||
No classes for parent slot igsn_identifiers | ||
No classes for slot input_volume | ||
No classes for slot insdc_analysis_identifiers | ||
No classes for parent slot insdc_identifiers | ||
No classes for slot insdc_secondary_sample_identifiers | ||
No classes for slot insdc_sra_ena_study_identifiers | ||
No classes for parent slot jgi_portal_identifiers | ||
No classes for slot material_component_separation | ||
No classes for parent slot metagenome_assembly_parameter | ||
No classes for slot mgnify_analysis_identifiers | ||
No classes for parent slot mgnify_identifiers | ||
No classes for slot modifier_substance | ||
No classes for parent slot neon_identifiers | ||
No classes for slot nucl_acid_ext, MIXS:0000037 | ||
No classes for parent slot object_set | ||
No classes for parent slot omics_processing_identifiers | ||
No classes for slot omics_type | ||
No classes for slot pool_dna_extracts, MIXS:0000325 | ||
No classes for parent slot read_qc_analysis_statistic | ||
No classes for slot samp_vol_we_dna_ext, MIXS:0000111 | ||
No classes for slot sample_collection_day | ||
No classes for slot sample_collection_hour | ||
No classes for slot sample_collection_minute | ||
No classes for slot sample_collection_month | ||
No classes for slot sample_collection_year | ||
No classes for slot seq_meth, MIXS:0000050 | ||
No classes for slot seq_quality_check, MIXS:0000051 | ||
No classes for slot soil_annual_season_temp | ||
No classes for parent slot study_identifiers | ||
No classes for slot value | ||
|
||
|
||
Report of enums that aren't associated with any slots: | ||
|
||
No slots for ProteolyticEnzymeEnum | ||
No slots for freq_clean_enum | ||
No slots for organism_count_enum | ||
No slots for plant_growth_med_enum | ||
No slots for samp_md_enum | ||
|
||
|
||
Report of all types. Manual review recommended: | ||
|
||
['bytes', | ||
'decimal degree', | ||
'language code', | ||
'unit', | ||
'string', | ||
'integer', | ||
'boolean', | ||
'float', | ||
'double', | ||
'decimal', | ||
'time', | ||
'date', | ||
'datetime', | ||
'date_or_datetime', | ||
'uriorcurie', | ||
'curie', | ||
'uri', | ||
'ncname', | ||
'objectidentifier', | ||
'nodeidentifier', | ||
'jsonpointer', | ||
'jsonpath', | ||
'sparqlpath', | ||
'external_identifier'] | ||
|
||
|
||
Report of all elements whose names contain whitespace: | ||
|
||
name: decimal degree | ||
description: A decimal degree expresses latitude or longitude as decimal fractions. | ||
from_schema: https://w3id.org/nmdc/core | ||
see_also: | ||
- https://en.wikipedia.org/wiki/Decimal_degrees | ||
base: float | ||
uri: xsd:decimal | ||
|
||
name: language code | ||
description: A language code conforming to ISO_639-1 | ||
from_schema: https://w3id.org/nmdc/core | ||
see_also: | ||
- https://en.wikipedia.org/wiki/ISO_639-1 | ||
base: str | ||
uri: xsd:language | ||
|
||
name: nucleic acid sequence source | ||
from_schema: https://w3id.org/nmdc/core | ||
|
||
name: workflow subset | ||
description: Subset consisting of just the workflow execution activities | ||
from_schema: https://w3id.org/nmdc/workflow_execution_activity | ||
|
||
name: sample subset | ||
description: Subset consisting of entities linked to the processing of samples. Currently, | ||
this subset consists of study, omics process, and biosample. | ||
from_schema: https://w3id.org/nmdc/nmdc | ||
|
||
name: data object subset | ||
description: Subset consisting of the data objects that are either inputs or outputs of | ||
processes or workflows. | ||
from_schema: https://w3id.org/nmdc/nmdc | ||
|
||
|
||
|
||
Report of subsets usage: | ||
|
||
{'data object subset': ['DataObject'], | ||
'data_portal_subset': ['associated_dois', | ||
'doi_value', | ||
'doi_provider', | ||
'doi_category'], | ||
'environment': [], | ||
'investigation': [], | ||
'nucleic acid sequence source': [], | ||
'proteases': [], | ||
'sample subset': ['Biosample', 'Study', 'DataGeneration'], | ||
'sequencing': [], | ||
'workflow subset': ['WorkflowExecution', | ||
'MetagenomeAssembly', | ||
'MetatranscriptomeAssembly', | ||
'MetagenomeAnnotation', | ||
'MetatranscriptomeAnnotation', | ||
'MetatranscriptomeAnalysis', | ||
'MagsAnalysis', | ||
'MetagenomeSequencing', | ||
'ReadQcAnalysis', | ||
'ReadBasedTaxonomyAnalysis', | ||
'MetabolomicsAnalysis', | ||
'MetaproteomicsAnalysis', | ||
'NomAnalysis']} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import os | ||
import pprint | ||
|
||
import yaml | ||
from linkml_runtime import SchemaView | ||
|
||
# Path of the schema file that SchemaView loads.
schema_file = "src/schema/nmdc.yaml"

# Directory (relative path) whose YAML files are scanned for "type" values.
source_directory = "src/data/valid"

# Schema accessor, used below to enumerate classes and resolve their URIs.
schema_view = SchemaView(schema_file)
|
||
|
||
def find_type_keys(data):
    """
    Recursively counts all "type" key values within a YAML structure.

    Counts found in nested structures are added to the running totals, so a
    type that is asserted several times (for example by sibling objects in a
    list) is counted once per assertion.

    Args:
        data: The YAML data structure (dict or list). Any other value
            (scalar, None) contributes no counts.
    Returns:
        A dictionary where keys are type values and values are their counts.
    """
    tc = {}

    def _merge(nested_counts):
        # Add, never overwrite: dict.update() would replace an existing count
        # with the nested one and silently under-count repeated types.
        for type_value, count in nested_counts.items():
            tc[type_value] = tc.get(type_value, 0) + count

    if isinstance(data, dict):
        for key, value in data.items():
            if key == "type" and not isinstance(value, (dict, list)):
                tc[value] = tc.get(value, 0) + 1  # Increment count
            else:
                # Covers ordinary keys as well as a "type" key that holds a
                # container, which cannot be used as a dictionary key.
                _merge(find_type_keys(value))
    elif isinstance(data, list):
        for item in data:
            _merge(find_type_keys(item))
    return tc
|
||
|
||
def process_all_files(directory):
    """
    Processes all YAML files in a directory, finding all "type" key values and accumulating counts in a dictionary.

    Only entries whose name ends with ".yaml" are read; subdirectories are
    not descended into.

    Args:
        directory: The directory containing YAML files.
    Returns:
        A dictionary where keys are type values from all files and values are their total counts.
    """
    tc = {}
    # os.listdir() returns entries in arbitrary order; sort so that runs are
    # reproducible across machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".yaml"):
            continue
        filepath = os.path.join(directory, filename)
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(filepath, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        # Add this file's counts to the running totals.
        for type_value, count in find_type_keys(data).items():
            tc[type_value] = tc.get(type_value, 0) + count
    return tc
|
||
|
||
# Tally the "type" values asserted across every YAML file in the source directory.
type_counts = process_all_files(source_directory)

print("Types asserted in any src/data/valid YAML file:")
print("")
pprint.pprint(type_counts)
print("\n")

schema_classes = schema_view.all_classes()

# For each schema class, record the URI returned with native=False and native=True.
available_classes = {
    class_name: {
        "declared_uri": schema_view.get_uri(class_def, native=False),
        "native_uri": schema_view.get_uri(class_def, native=True),
    }
    for class_name, class_def in schema_classes.items()
}

# Classes for which neither URI appears among the counted type values,
# split by whether the class is flagged abstract.
lacks_example = {"abstract": {}, "concrete": {}}
for class_name, uris in available_classes.items():
    if uris["declared_uri"] in type_counts or uris["native_uri"] in type_counts:
        continue  # at least one of the class's URIs was asserted
    if schema_view.get_class(class_name).abstract:
        bucket = "abstract"
    else:
        bucket = "concrete"
    lacks_example[bucket][class_name] = uris

print("Classes that are not instantiated in any src/data/valid YAML file:")
print("")
pprint.pprint(lacks_example)
Oops, something went wrong.