Merge pull request #111 from microbiomedata/schema-pattern-linting

schema pattern linting
microbiomedata · Mar 29, 2024 · 5026c97 · 5026c97
2 parents c49823b + 3dd5e90
commit 5026c97
Show file tree

Hide file tree

Showing 5 changed files with 437 additions and 0 deletions.
diff --git a/assets/check_examples_class_coverage.txt b/assets/check_examples_class_coverage.txt
@@ -0,0 +1,115 @@
+Types asserted in any src/data/valid YAML file:
+
+{'nmdc:Biosample': 18,
+ 'nmdc:ChemicalConversionProcess': 2,
+ 'nmdc:ChemicalEntity': 2,
+ 'nmdc:ChromatographicSeparationProcess': 3,
+ 'nmdc:CollectingBiosamplesFromSite': 1,
+ 'nmdc:ControlledIdentifiedTermValue': 18,
+ 'nmdc:ControlledTermValue': 1,
+ 'nmdc:DataObject': 10,
+ 'nmdc:DissolvingProcess': 1,
+ 'nmdc:Doi': 4,
+ 'nmdc:Extraction': 9,
+ 'nmdc:FieldResearchSite': 1,
+ 'nmdc:FiltrationProcess': 1,
+ 'nmdc:FunctionalAnnotation': 3,
+ 'nmdc:FunctionalAnnotationAggMember': 2,
+ 'nmdc:GeneProduct': 1,
+ 'nmdc:GeolocationValue': 3,
+ 'nmdc:ImageValue': 2,
+ 'nmdc:Instrument': 2,
+ 'nmdc:LibraryPreparation': 4,
+ 'nmdc:MagBin': 1,
+ 'nmdc:MagsAnalysis': 1,
+ 'nmdc:MassSpectrometry': 1,
+ 'nmdc:MetabolomicsAnalysis': 2,
+ 'nmdc:MetagenomeAssembly': 2,
+ 'nmdc:MetagenomeSequencing': 2,
+ 'nmdc:MixingProcess': 1,
+ 'nmdc:NomAnalysis': 1,
+ 'nmdc:NucleotideSequencing': 8,
+ 'nmdc:OntologyClass': 18,
+ 'nmdc:PersonValue': 7,
+ 'nmdc:PlaceholderClass': 1,
+ 'nmdc:Pooling': 7,
+ 'nmdc:ProcessedSample': 5,
+ 'nmdc:Protocol': 3,
+ 'nmdc:ProtocolExecution': 2,
+ 'nmdc:QuantityValue': 19,
+ 'nmdc:ReadQcAnalysis': 4,
+ 'nmdc:Solution': 8,
+ 'nmdc:SolutionComponent': 9,
+ 'nmdc:Study': 10,
+ 'nmdc:SubSamplingProcess': 1,
+ 'nmdc:TextValue': 3,
+ 'nmdc:TimestampValue': 1,
+ 'nmdc:WorkflowChain': 3,
+ 'prov:Association': 2}
+
+
+Classes that are not instantiated in any src/data/valid YAML file:
+
+{'abstract': {'DataGeneration': {'declared_uri': 'nmdc:DataGeneration',
+                                 'native_uri': 'nmdc:DataGeneration'},
+              'FluidHandling': {'declared_uri': 'nmdc:FluidHandling',
+                                'native_uri': 'nmdc:FluidHandling'},
+              'FunctionalAnnotationTerm': {'declared_uri': 'nmdc:FunctionalAnnotationTerm',
+                                           'native_uri': 'nmdc:FunctionalAnnotationTerm'},
+              'MaterialEntity': {'declared_uri': 'nmdc:MaterialEntity',
+                                 'native_uri': 'nmdc:MaterialEntity'},
+              'NamedThing': {'declared_uri': 'nmdc:NamedThing',
+                             'native_uri': 'nmdc:NamedThing'},
+              'PlannedProcess': {'declared_uri': 'OBI:0000011',
+                                 'native_uri': 'nmdc:PlannedProcess'},
+              'Site': {'declared_uri': 'nmdc:Site', 'native_uri': 'nmdc:Site'}},
+ 'concrete': {'AttributeValue': {'declared_uri': 'nmdc:AttributeValue',
+                                 'native_uri': 'nmdc:AttributeValue'},
+              'BooleanValue': {'declared_uri': 'nmdc:BooleanValue',
+                               'native_uri': 'nmdc:BooleanValue'},
+              'Database': {'declared_uri': 'nmdc:Database',
+                           'native_uri': 'nmdc:Database'},
+              'DirectInfusionProcess': {'declared_uri': 'nmdc:DirectInfusionProcess',
+                                        'native_uri': 'nmdc:DirectInfusionProcess'},
+              'EnvironmentalMaterialTerm': {'declared_uri': 'nmdc:EnvironmentalMaterialTerm',
+                                            'native_uri': 'nmdc:EnvironmentalMaterialTerm'},
+              'FailureCategorization': {'declared_uri': 'nmdc:FailureCategorization',
+                                        'native_uri': 'nmdc:FailureCategorization'},
+              'GenomeFeature': {'declared_uri': 'nmdc:GenomeFeature',
+                                'native_uri': 'nmdc:GenomeFeature'},
+              'IntegerValue': {'declared_uri': 'nmdc:IntegerValue',
+                               'native_uri': 'nmdc:IntegerValue'},
+              'MaterialProcessing': {'declared_uri': 'nmdc:MaterialProcessing',
+                                     'native_uri': 'nmdc:MaterialProcessing'},
+              'MetaboliteQuantification': {'declared_uri': 'nmdc:MetaboliteQuantification',
+                                           'native_uri': 'nmdc:MetaboliteQuantification'},
+              'MetagenomeAnnotation': {'declared_uri': 'nmdc:MetagenomeAnnotation',
+                                       'native_uri': 'nmdc:MetagenomeAnnotation'},
+              'MetaproteomicsAnalysis': {'declared_uri': 'nmdc:MetaproteomicsAnalysis',
+                                         'native_uri': 'nmdc:MetaproteomicsAnalysis'},
+              'MetatranscriptomeAnalysis': {'declared_uri': 'nmdc:MetatranscriptomeAnalysis',
+                                            'native_uri': 'nmdc:MetatranscriptomeAnalysis'},
+              'MetatranscriptomeAnnotation': {'declared_uri': 'nmdc:MetatranscriptomeAnnotation',
+                                              'native_uri': 'nmdc:MetatranscriptomeAnnotation'},
+              'MetatranscriptomeAssembly': {'declared_uri': 'nmdc:MetatranscriptomeAssembly',
+                                            'native_uri': 'nmdc:MetatranscriptomeAssembly'},
+              'OrthologyGroup': {'declared_uri': 'nmdc:OrthologyGroup',
+                                 'native_uri': 'nmdc:OrthologyGroup'},
+              'Pathway': {'declared_uri': 'nmdc:Pathway',
+                          'native_uri': 'nmdc:Pathway'},
+              'PeptideQuantification': {'declared_uri': 'nmdc:PeptideQuantification',
+                                        'native_uri': 'nmdc:PeptideQuantification'},
+              'ProteinQuantification': {'declared_uri': 'nmdc:ProteinQuantification',
+                                        'native_uri': 'nmdc:ProteinQuantification'},
+              'Reaction': {'declared_uri': 'nmdc:Reaction',
+                           'native_uri': 'nmdc:Reaction'},
+              'ReactionParticipant': {'declared_uri': 'nmdc:ReactionParticipant',
+                                      'native_uri': 'nmdc:ReactionParticipant'},
+              'ReadBasedTaxonomyAnalysis': {'declared_uri': 'nmdc:ReadBasedTaxonomyAnalysis',
+                                            'native_uri': 'nmdc:ReadBasedTaxonomyAnalysis'},
+              'Substance': {'declared_uri': 'nmdc:Substance',
+                            'native_uri': 'nmdc:Substance'},
+              'UrlValue': {'declared_uri': 'nmdc:UrlValue',
+                           'native_uri': 'nmdc:UrlValue'},
+              'WorkflowExecution': {'declared_uri': 'nmdc:WorkflowExecution',
+                                    'native_uri': 'nmdc:WorkflowExecution'}}}
diff --git a/assets/schema_pattern_linting.txt b/assets/schema_pattern_linting.txt
@@ -0,0 +1,150 @@
+Report of slots that aren't associated with any classes:
+
+No classes for parent slot analysis_identifiers
+No classes for parent slot assembly_identifiers
+No classes for slot biogas_retention_time
+No classes for slot biogas_temperature
+No classes for parent slot biosample_identifiers
+No classes for slot completion_date
+No classes for slot date_created
+No classes for slot ecosystem_path_id
+No classes for parent slot emsl_identifiers
+No classes for slot emsl_store_temp
+No classes for slot etl_software_version
+No classes for parent slot gff_coordinate
+No classes for parent slot gnps_identifiers
+No classes for parent slot gold_identifiers
+No classes for parent slot gold_path_field
+No classes for parent slot has_participants
+No classes for parent slot igsn_identifiers
+No classes for slot input_volume
+No classes for slot insdc_analysis_identifiers
+No classes for parent slot insdc_identifiers
+No classes for slot insdc_secondary_sample_identifiers
+No classes for slot insdc_sra_ena_study_identifiers
+No classes for parent slot jgi_portal_identifiers
+No classes for slot material_component_separation
+No classes for parent slot metagenome_assembly_parameter
+No classes for slot mgnify_analysis_identifiers
+No classes for parent slot mgnify_identifiers
+No classes for slot modifier_substance
+No classes for parent slot neon_identifiers
+No classes for slot nucl_acid_ext, MIXS:0000037
+No classes for parent slot object_set
+No classes for parent slot omics_processing_identifiers
+No classes for slot omics_type
+No classes for slot pool_dna_extracts, MIXS:0000325
+No classes for parent slot read_qc_analysis_statistic
+No classes for slot samp_vol_we_dna_ext, MIXS:0000111
+No classes for slot sample_collection_day
+No classes for slot sample_collection_hour
+No classes for slot sample_collection_minute
+No classes for slot sample_collection_month
+No classes for slot sample_collection_year
+No classes for slot seq_meth, MIXS:0000050
+No classes for slot seq_quality_check, MIXS:0000051
+No classes for slot soil_annual_season_temp
+No classes for parent slot study_identifiers
+No classes for slot value
+
+
+Report of enums that aren't associated with any slots:
+
+No slots for ProteolyticEnzymeEnum
+No slots for freq_clean_enum
+No slots for organism_count_enum
+No slots for plant_growth_med_enum
+No slots for samp_md_enum
+
+
+Report of all types. Manual review recommended:
+
+['bytes',
+ 'decimal degree',
+ 'language code',
+ 'unit',
+ 'string',
+ 'integer',
+ 'boolean',
+ 'float',
+ 'double',
+ 'decimal',
+ 'time',
+ 'date',
+ 'datetime',
+ 'date_or_datetime',
+ 'uriorcurie',
+ 'curie',
+ 'uri',
+ 'ncname',
+ 'objectidentifier',
+ 'nodeidentifier',
+ 'jsonpointer',
+ 'jsonpath',
+ 'sparqlpath',
+ 'external_identifier']
+
+
+Report of all elements whose names contain whitespace:
+
+name: decimal degree
+description: A decimal degree expresses latitude or longitude as decimal fractions.
+from_schema: https://w3id.org/nmdc/core
+see_also:
+- https://en.wikipedia.org/wiki/Decimal_degrees
+base: float
+uri: xsd:decimal
+
+name: language code
+description: A language code conforming to ISO_639-1
+from_schema: https://w3id.org/nmdc/core
+see_also:
+- https://en.wikipedia.org/wiki/ISO_639-1
+base: str
+uri: xsd:language
+
+name: nucleic acid sequence source
+from_schema: https://w3id.org/nmdc/core
+
+name: workflow subset
+description: Subset consisting of just the workflow execution activities
+from_schema: https://w3id.org/nmdc/workflow_execution_activity
+
+name: sample subset
+description: Subset consisting of entities linked to the processing of samples.  Currently,
+  this subset consists of study, omics process, and biosample.
+from_schema: https://w3id.org/nmdc/nmdc
+
+name: data object subset
+description: Subset consisting of the data objects that either inputs or outputs of
+  processes or workflows.
+from_schema: https://w3id.org/nmdc/nmdc
+
+
+
+Report of subsets usage:
+
+{'data object subset': ['DataObject'],
+ 'data_portal_subset': ['associated_dois',
+                        'doi_value',
+                        'doi_provider',
+                        'doi_category'],
+ 'environment': [],
+ 'investigation': [],
+ 'nucleic acid sequence source': [],
+ 'proteases': [],
+ 'sample subset': ['Biosample', 'Study', 'DataGeneration'],
+ 'sequencing': [],
+ 'workflow subset': ['WorkflowExecution',
+                     'MetagenomeAssembly',
+                     'MetatranscriptomeAssembly',
+                     'MetagenomeAnnotation',
+                     'MetatranscriptomeAnnotation',
+                     'MetatranscriptomeAnalysis',
+                     'MagsAnalysis',
+                     'MetagenomeSequencing',
+                     'ReadQcAnalysis',
+                     'ReadBasedTaxonomyAnalysis',
+                     'MetabolomicsAnalysis',
+                     'MetaproteomicsAnalysis',
+                     'NomAnalysis']}
diff --git a/project.Makefile b/project.Makefile
@@ -544,3 +544,9 @@ local/mongo_as_nmdc_database_cuire_repaired_stamped.ttl: local/mongo_as_nmdc_dat
 	$(RUN) python src/scripts/date_created_blank_node.py > local/date_created_blank_node.ttl
 	cat $^ local/date_created_blank_node.ttl > $@
 	rm local/date_created_blank_node.ttl
+
+# Write the stdout of src/scripts/check_examples_class_coverage.py to the target file.
+# The rule lists no prerequisites, so make only runs it when the target is missing.
+assets/check_examples_class_coverage.txt:
	$(RUN) python src/scripts/check_examples_class_coverage.py > $@
+
+# Write the stdout of src/scripts/schema_pattern_linting.py to the target file.
+# The rule lists no prerequisites, so make only runs it when the target is missing.
+assets/schema_pattern_linting.txt:
	$(RUN) python src/scripts/schema_pattern_linting.py > $@
diff --git a/src/scripts/check_examples_class_coverage.py b/src/scripts/check_examples_class_coverage.py
@@ -0,0 +1,92 @@
+import os
+import pprint
+
+import yaml
+from linkml_runtime import SchemaView
+
+# Relative paths: both are resolved against the current working directory.
+schema_file = "src/schema/nmdc.yaml"
+source_directory = "src/data/valid"
+
+# Load the schema once at module level; it is queried further below.
+schema_view = SchemaView(schema_file)
+
+
+def find_type_keys(data):
+    """
+    Recursively counts the values of all "type" keys within a YAML structure.
+
+    Counts are summed over the whole structure, so a value that appears in
+    several sibling or nested objects is counted once per occurrence.
+
+    Args:
+        data: The parsed YAML data structure (dict, list or scalar).
+
+    Returns:
+        A dictionary where keys are type values and values are their counts.
+    """
+    tc = {}
+
+    def walk(node):
+        # Accumulate into the single shared dict instead of merging per-branch
+        # results; merging with dict.update() would overwrite earlier counts.
+        if isinstance(node, dict):
+            for key, value in node.items():
+                # A dict or list under "type" is unhashable and cannot be
+                # counted; search inside it instead.
+                if key == "type" and not isinstance(value, (dict, list)):
+                    tc[value] = tc.get(value, 0) + 1
+                else:
+                    walk(value)
+        elif isinstance(node, list):
+            for item in node:
+                walk(item)
+        # Anything else is a scalar and cannot contain "type" keys.
+
+    walk(data)
+    return tc
+
+
+def process_all_files(directory):
+    """
+    Processes all YAML files in a directory, finding all "type" key values and accumulating counts in a dictionary.
+
+    Only direct children of the directory whose names end in ".yaml" are read.
+
+    Args:
+        directory: The directory containing YAML files.
+
+    Returns:
+        A dictionary where keys are type values from all files and values are their total counts.
+    """
+    tc = {}
+    # os.listdir() order is unspecified; sort so files are visited predictably.
+    for filename in sorted(os.listdir(directory)):
+        if not filename.endswith(".yaml"):
+            continue
+        filepath = os.path.join(directory, filename)
+        # Explicit encoding: the platform default is not UTF-8 everywhere.
+        with open(filepath, "r", encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+        # The file is closed by now; sum (rather than overwrite) its counts
+        # into the running totals.
+        for key, value in find_type_keys(data).items():
+            tc[key] = tc.get(key, 0) + value
+    return tc
+
+
+# Tally every "type" assertion found across the example data files
+type_counts = process_all_files(source_directory)
+
+print("Types asserted in any src/data/valid YAML file:")
+print("")
+pprint.pprint(type_counts)
+print("\n")
+
+schema_classes = schema_view.all_classes()
+
+# Both URI forms are recorded because either may be matched against the counts
+available_classes = {
+    class_name: {
+        "declared_uri": schema_view.get_uri(class_def, native=False),
+        "native_uri": schema_view.get_uri(class_def, native=True),
+    }
+    for class_name, class_def in schema_classes.items()
+}
+
+# Classes with no matching "type" value, split by whether they are abstract
+lacks_example = {"abstract": {}, "concrete": {}}
+for class_name, uris in available_classes.items():
+    if uris["declared_uri"] in type_counts or uris["native_uri"] in type_counts:
+        continue
+    is_abstract = schema_view.get_class(class_name).abstract
+    bucket = "abstract" if is_abstract else "concrete"
+    lacks_example[bucket][class_name] = uris
+
+print("Classes that are not instantiated in any src/data/valid YAML file:")
+print("")
+pprint.pprint(lacks_example)