microbiomedata · brynnz22 · May 14, 2024 · Apr 23, 2024 · Apr 23, 2024 · Apr 23, 2024
diff --git a/nmdc_schema/migrators/migrator_from_X_to_PR31.py b/nmdc_schema/migrators/migrator_from_X_to_PR31.py
@@ -0,0 +1,85 @@
+from nmdc_schema.migrators.migrator_base import MigratorBase
+from nmdc_schema.migrators.adapters.adapter_base import AdapterBase
+from difflib import SequenceMatcher
+
+class Migrator(MigratorBase):
+    r"""
+    Migrates data from X to PR31: removes the `used` slot from each WorkflowExecution document whose value
+    there is similar enough to the `instrument_name` of the OmicsProcessing document it was informed by.
+    """
+
+    _from_version = "X"
+    _to_version = "PR31"
+
+    def upgrade(self):
+        r"""Migrates the database from conforming to the original schema, to conforming to the new schema."""
+
+        workflow_execution_collection_names = [
+            "mags_activity_set",
+            "metabolomics_analysis_activity_set",
+            "metagenome_annotation_activity_set",
+            "metagenome_assembly_set",
+            "metagenome_sequencing_activity_set",
+            "metatranscriptome_activity_set",
+            "nom_analysis_activity_set",
+            "omics_processing_set",
+            "read_based_taxonomy_analysis_activity_set",
+            "read_qc_analysis_activity_set",  # comma required: adjacent string literals get concatenated
+            "metaproteomics_analysis_activity_set",
+        ]
+
+        for collection_name in workflow_execution_collection_names:
+            self.adapter.process_each_document(
+                collection_name=collection_name,
+                pipeline=[self.remove_used_slot],
+            )
+
+    def preprocess_string(self, s):
+        r"""
+        Normalizes a string prior to comparison with difflib's SequenceMatcher by removing spaces, hyphens,
+        and underscores, so that differences in those characters do not lower the similarity ratio.
+
+        >>> m = Migrator()
+        >>> m.preprocess_string('a  b_-_c -de:f g')
+        'abcde:fg'
+        """
+
+        return s.replace(" ", "").replace("_", "").replace("-", "")
+
+    def remove_used_slot(self, doc: dict) -> dict:
+        r"""
+        Removes the `used` slot from a `WorkflowExecution` document if its value is similar enough (difflib
+        `SequenceMatcher` ratio of at least 0.8 after normalization) to the `instrument_name` of the referenced
+        `OmicsProcessing` document. Otherwise, logs an error and returns the document unchanged.
+
+        >>> from nmdc_schema.migrators.adapters.dictionary_adapter import DictionaryAdapter
+        >>>
+        >>> database = {'omics_processing_set':[{'id':'nmdc:omcp-123', 'instrument_name':'nmdc:wfc-456'}]}  # in this example, our data store is a Python dictionary
+        >>> adapter = DictionaryAdapter(database=database)
+        >>> m = Migrator(adapter=adapter)
+        >>> m.remove_used_slot({'id': 'nmdc:metab-123', 'used': 'nmdc:wfc-456', 'was_informed_by': 'nmdc:omcp-123'})
+        {'id': 'nmdc:metab-123', 'was_informed_by': 'nmdc:omcp-123'}
+        """
+
+        if "used" not in doc:
+            return doc
+
+        omics_processing_doc = self.adapter.get_document_having_value_in_field(
+            collection_name="omics_processing_set", field_name="id", value=doc["was_informed_by"]
+        )
+        instrument_name = (omics_processing_doc or {}).get("instrument_name")
+        if instrument_name is None:  # nothing to validate against, so keep `used` rather than lose information
+            self.logger.error(f"Workflow doc {doc['id']} has no OmicsProcessing doc ({doc['was_informed_by']}) with an instrument_name to compare against")
+            return doc
+
+        # Compare the instrument strings with hyphens, underscores, and blank spaces ignored.
+        similarity_ratio = SequenceMatcher(None, self.preprocess_string(doc["used"]), self.preprocess_string(instrument_name)).ratio()
+        if similarity_ratio < 0.8:  # below the acceptance threshold: report the mismatch and keep `used`
+            self.logger.error(f"Workflow doc {doc['id']} with instrument: {doc['used']} does not match {instrument_name}")
+            return doc
+        if similarity_ratio < 1.0:
+            self.logger.info(f"Workflow with id {doc['id']} has instrument: {doc['used']} matches OmicsProcessing doc instrument: {instrument_name} well enough")
+        doc.pop("used")
+        return doc
+
+
diff --git a/src/data/valid/Database-neon_Biosample_to_DataObject_NEON.yaml b/src/data/valid/Database-neon_Biosample_to_DataObject_NEON.yaml
@@ -126,13 +126,19 @@ data_generation_set:
       - nmdc:dobj-12-jdhk9537
       - nmdc:dobj-12-yx0tfp52
     instrument_used:
-      - nmdc:inst-12-yx0tfp52
+      - nmdc:inst-14-xx07be40
     part_of:
       - nmdc:dgns-11-34xj1150
     processing_institution: Battelle
     type: nmdc:NucleotideSequencing
     associated_studies: 
       - nmdc:sty-11-34xj1150
+instrument_set:
+  - id: nmdc:inst-14-xx07be40
+    model: novaseq
+    name: Illumina NovaSeq
+    vendor: illumina
+    type: nmdc:Instrument
 workflow_chain_set:
   - id: nmdc:wfch-11-ab
     analyte_category: metagenome

diff --git a/src/schema/basic_classes.yaml b/src/schema/basic_classes.yaml
@@ -73,7 +73,6 @@ classes:
     slots:
       - has_input
       - has_output
-      - instrument_used
       - processing_institution
       - protocol_link
       - start_date
@@ -309,6 +308,7 @@ classes:
       - mod_date
       - part_of
       - principal_investigator
+      - instrument_used
     slot_usage:
       has_input:
         required: true
@@ -521,6 +521,7 @@ slots:
     range: Instrument
     multivalued: true
     description: What instrument was used during DataGeneration or MaterialProcessing.
+    pattern: "^nmdc:inst-[0-9][a-z]{0,6}[0-9]-[A-Za-z0-9]{1,}(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$"
 
   model:
     range: InstrumentModelEnum

diff --git a/src/schema/core.yaml b/src/schema/core.yaml
@@ -1183,7 +1183,8 @@ classes:
     description:
       A process that takes one or more samples as inputs and generates
       one or more samples as outputs.
-
+    slots:
+      - instrument_used
     notes:
       - This class is a replacement for BiosampleProcessing.
     slot_usage: