Merge branch 'popsim-consortium:main' into mus_mus_dem_history

popsim-consortium · Oct 9, 2023 · 51386ba · 51386ba
2 parents abc45cb + d89010f
commit 51386ba
Show file tree

Hide file tree

Showing 13 changed files with 571 additions and 108 deletions.
diff --git a/.github/mergify.yml b/.github/mergify.yml
@@ -6,8 +6,8 @@ queue_rules:
       - check-success=pre-commit
       - check-success=tests (ubuntu-20.04, 3.7)
       - check-success=tests (ubuntu-20.04, 3.10)
-      - check-success=tests (macos-10.15, 3.7)
-      - check-success=tests (macos-10.15, 3.10)
+      - check-success=tests (macos-11, 3.7)
+      - check-success=tests (macos-11, 3.10)
       - check-success=tests (windows-latest, 3.7)
       - check-success=tests (windows-latest, 3.10)
       - check-success=codecov/patch

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -17,13 +17,12 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04, macos-10.15, windows-latest]
-        python: [3.7, "3.10"]
+        os: [ubuntu-20.04, macos-11, windows-latest]
+        python: [3.8, "3.10"]
     env:
       CONDA_ENV_NAME: stdpopsim
       OS: ${{ matrix.os }}
       PYTHON: ${{ matrix.python }}
-
     steps:
       - name: cancel previous runs
         uses: styfle/cancel-workflow-action@0.6.0
@@ -36,7 +35,7 @@ jobs:
       - name: find conda
         id: find-conda
         run: |
-          echo "::set-output name=CONDA::$CONDA"
+          echo "name=CONDA::$CONDA" >> $GITHUB_OUTPUT
 
       - name: fix conda permissions
         if: runner.os == 'macOS'
@@ -46,10 +45,10 @@ jobs:
 
       - name: cache conda
         id: cache
-        uses: actions/cache@v2
+        uses: actions/cache@v3
         env:
           # Increase this to reset the cache if the key hasn't changed.
-          CACHE_NUM: 2
+          CACHE_NUM: 5
         with:
           path: |
             ${{ steps.find-conda.outputs.CONDA }}/envs/${{ env.CONDA_ENV_NAME }}
@@ -99,3 +98,6 @@ jobs:
         with:
           fail_ci_if_error: true
           env_vars: OS,PYTHON
+          # Use upload token to avoid upload failures.
+          # https://github.com/codecov/codecov-action/issues/837
+          token: 8ae76ade-9d0e-4a2d-b9ef-1180235be07f
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,22 @@
 [0.2.1] - 2023-XX-XX
 --------------------
 
+**Breaking changes**:
+
+- To add the possibility for a relationship between dominance and selection
+    coefficient, now each stdpopsim MutationType might have more than one
+    underlying SLiM mutation type; so, where this is recorded in top-level
+    metadata (under `ts.metadata["stdpopsim"]["DFEs"]`) is now a list
+    instead of a single value. This will not affect anyone who is not
+    parsing the metadata related to DFEs.
+
+**New features**:
+
+- *Relationship between dominance and selection coefficient:*
+    Added the `dominance_coeff_list` argument to `MutationType`, allowing
+    for DFEs with a discretized relationship between h and s.
+    (:user:`petrelharp`, :pr:`1498`)
+
 **New species**:
 
 - Mus musculus (:user:`peterdfields`, :pr:`1437`).

diff --git a/docs/parameter_tables/OrySat/BottleneckMigration_3C07.csv b/docs/parameter_tables/OrySat/BottleneckMigration_3C07.csv
@@ -10,4 +10,4 @@ Migration rate (x10^-5),2.33,TRJ-RUF migration rate (per gen.)
 Epoch Time (gen.),"12,000",Time of origin and beginning of domestication bottleneck of IND and TRJ
 Epoch Time (gen.),"9,000",Time of partial recovery from bottleneck of IND and TRJ
 Generation time (yrs.),1,Generation time
-Mutation rate,3.2e-9,Per-base per-generation mutation rate
+Mutation rate,9.03e-9,Per-base per-generation mutation rate
diff --git a/requirements/CI/requirements.txt b/requirements/CI/requirements.txt
@@ -1,11 +1,10 @@
-codecov==2.1.13
 coverage==6.3.3
 flake8==5.0.4
-pytest==7.2.1
-pytest-cov==4.0.0
-pytest-xdist==3.2.0
+pytest==7.3.2
+pytest-cov==4.1.0
+pytest-xdist==3.3.1
 setuptools-scm==7.1.0
-sphinx==4.3.2
+sphinx==5.0.2
 sphinx-argparse==0.4.0
 sphinx-issues==3.0.1
 sphinx_rtd_theme==1.2.0
@@ -15,11 +14,9 @@ attrs==21.4.0
 appdirs==1.4.4
 humanize==4.6.0
 pyslim==1.0.1
-numpy==1.21.5; python_version=='3.7'
-numpy==1.22.3; python_version>='3.8'
-scipy; python_version=='3.7'
-scipy==1.10.1; python_version>='3.8'
-scikit-allel==1.3.5
+numpy==1.22.3
+scipy==1.10.1
+scikit-allel==1.3.6
 biopython==1.80
 demes==0.2.2
 demesdraw==0.3.1

diff --git a/requirements/development.txt b/requirements/development.txt
@@ -1,6 +1,5 @@
 black>=22.1.0
 blacken-docs
-codecov
 coverage
 daiquiri
 flake8

diff --git a/stdpopsim/catalog/HomSap/dfes.py b/stdpopsim/catalog/HomSap/dfes.py
@@ -34,6 +34,9 @@ def _KimDFE():
     Na = 12378
     gamma_scale = 875
     gamma_shape = 0.186  # shape
+    # Extra factor of 2 in mean is to account for difference between tools
+    # in how fitness is defined
+    # (1+s for homozygote in SLiM versus 1+2s in dadi)
     gamma_mean = (-gamma_shape * gamma_scale * 2) / (2 * Na)  # expected value
     h = 0.5  # dominance coefficient
     negative = stdpopsim.MutationType(
@@ -74,7 +77,10 @@ def _HuberDFE():
     ]
     neutral = stdpopsim.MutationType()
     gamma_shape = 0.19  # shape
-    gamma_mean = -0.014  # expected value
+    # Extra factor of 2 in mean is to account for difference between tools
+    # in how fitness is defined
+    # (1+s for homozygote in SLiM versus 1+2s in dadi)
+    gamma_mean = -0.014 * 2  # expected value
     h = 0.5  # dominance coefficient
     negative = stdpopsim.MutationType(
         dominance_coeff=h,
@@ -141,3 +147,56 @@ def _HuberLogNormalDFE():
 
 
 _species.add_dfe(_HuberLogNormalDFE())
+
+
+def _KyriazisDFE():
+    id = "Mixed_K23"
+    description = "Deleterious Gamma DFE with additional lethals"
+    long_description = """
+    The DFE estimated from human data recommended in Kyriazis et al.
+    (2023), https://doi.org/10.1086/726736, for general use.
+    This model is similar to the Kim et al. (2017) DFE based on human
+    genetic data, modified to include the dominance distribution from
+    Henn et al. (2016).
+    The model is also augmented with an additional proportion of 0.3% of
+    recessive lethals, based on the analysis of Wade et al. (2023).
+    """
+    citations = [
+        stdpopsim.Citation(
+            author="Kyriazis et al.",
+            year=2023,
+            doi="https://doi.org/10.1086/726736",
+            reasons={stdpopsim.CiteReason.DFE},
+        )
+    ]
+    neutral = stdpopsim.MutationType()
+    gamma_mean = -0.0131
+    gamma_shape = 0.186
+    coefs = [0.45, 0.2, 0.05, 0]
+    breaks = [0.001, 0.01, 0.1]
+    gamma = stdpopsim.MutationType(
+        dominance_coeff_list=coefs,
+        dominance_coeff_breaks=breaks,
+        distribution_type="g",  # gamma distribution
+        distribution_args=[gamma_mean, gamma_shape],
+    )
+    lethal = stdpopsim.MutationType(
+        distribution_type="f",  # fixed value
+        distribution_args=[-1],  # fitness in SLiM for homozygotes is multiiplied by 1+s
+        dominance_coeff=0,
+    )
+    proportion_deleterious = 2.31 / (1 + 2.31)
+    lethal_prop = proportion_deleterious * 0.003  # 0.3% lethals
+    gamma_prop = proportion_deleterious - lethal_prop
+    neutral_prop = 1 - proportion_deleterious
+    return stdpopsim.DFE(
+        id=id,
+        description=description,
+        long_description=long_description,
+        mutation_types=[neutral, gamma, lethal],
+        proportions=[neutral_prop, gamma_prop, lethal_prop],
+        citations=citations,
+    )
+
+
+_species.add_dfe(_KyriazisDFE())
diff --git a/stdpopsim/catalog/OrySat/demographic_models.py b/stdpopsim/catalog/OrySat/demographic_models.py
@@ -34,24 +34,28 @@ def _BottMig_3C07():
     ]
 
     generation_time = 1
-    mutation_rate = 3.2e-9
 
     # First we set out the maximum likelihood values of the various parameters
     # given in Table 2.
     # To rescale the parameters, Caicedo et al. assumed that rice cultivation
     # began around 12,000 years (and therefore, generations) ago
-    # T_1 × 2 x N_rufi = 12000
 
     T_1 = 12000  # time to the start of domestication
-    T_B = 9000  # bottleneck lasted for 3000 years/generations
+    T_B = 0.75 * T_1  # bottleneck lasted for 3000 years/generations
 
     N_rufi = 12000 / (0.04 * 2)  # ancestral population size = 150000
-    N_trj = 0.12 * 150000  # population size for tropical japonica
-    N_ind = 0.27 * 150000  # population size for indica
+    N_trj = 0.12 * N_rufi  # population size for tropical japonica
+    N_ind = 0.27 * N_rufi  # population size for indica
     N_B = (
-        0.0055 * 150000
+        0.0055 * N_rufi
     )  # bottleneck population size for both indica and tropical japonica
 
+    # theta-rufi is Watterson's estimate of theta for O. rufipogon
+    # from Table 1
+    theta_rufi = 5.42e-3
+    mutation_rate = theta_rufi / (4 * N_rufi)
+
+    # number of migrants coming in
     # to japonica = 0.42, to indica = 0.945, to rufipogon = 3.5
 
     # Migration rates in the form M_from_to, thinking forward in time

diff --git a/stdpopsim/dfe.py b/stdpopsim/dfe.py
@@ -8,11 +8,17 @@
 import numpy as np
 
 
+def _copy_converter(x):
+    if isinstance(x, list):
+        x = x.copy()
+    return x
+
+
 @attr.s(kw_only=True)
 class MutationType(object):
     """
-    Class representing a "type" of mutation.  The design closely (exactly)
-    mirrors SLiM's MutationType class.
+    Class representing a "type" of mutation.  The design closely mirrors SLiM's
+    MutationType class.
 
     The main thing that mutation types carry is a way of drawing a selection
     coefficient for each new mutation. This ``distribution_type`` should be one
@@ -23,6 +29,7 @@ class MutationType(object):
     - ``g``: gamma, two parameters (mean, shape)
     - ``n``: normal, two parameters (mean, SD)
     - ``w``: Weibull, two parameters (scale, shape)
+    - ``u``: Uniform, two parameters (min, max)
     - ``lp``: positive logNormal, two parameters (mean and sd on log scale; see rlnorm)
     - ``ln``: negative logNormal, two parameters (mean and sd on log scale; see rlnorm)
 
@@ -31,28 +38,99 @@ class MutationType(object):
     exponential and gamma, a negative mean can be provided, obtaining always
     negative values.
 
+    Instead of a single dominance coefficient (which would be specified with
+    `dominance_coeff`), a discretized relationship between dominance and
+    selection coefficient can be implemented: if dominance_coeff_list is
+    provided, mutations with selection coefficient s for which
+    dominance_coeff_breaks[k-1] <= s <= dominance_coeff_breaks[k] will have
+    dominance coefficient dominance_coeff[k]. In other words, the first entry
+    of dominance_coeff_list applies to any mutations with selection coefficient
+    below the first entry of dominance_coeff_breaks; the second entry of
+    dominance_coeff_list applies to mutations with selection coefficient
+    between the first and second entries of dominance_coeff_breaks, and so
+    forth. The list of breaks must therefore be of length one less than the
+    list of dominance coefficients.
+
     :ivar distribution_type: A one-letter abbreviation for the distribution of
         fitness effects that each new mutation of this type draws from (see below).
     :vartype distribution_type: str
     :ivar distribution_args: Arguments for the distribution type.
     :vartype distribution_type: list
-    :ivar dominance_coeff: A single dominance coefficient (negative = underdominance,
+    :ivar dominance_coeff: The dominance coefficient (negative = underdominance,
         0 = recessive, 0.5 = additive, 1.0 = completely dominant, > 1.0 = overdominant)
+        Default: 0.5.
     :vartype dominance_coeff: float
     :ivar convert_to_substitution: Whether to retain any fixed mutations in the
         simulation: if not, we cannot ask about their frequency once fixed.
         (Either way, they will remain in the tree sequence).
     :vartype convert_to_substitution: bool
+    :ivar dominance_coeff_list: Either None (the default) or a list of floats describing
+        a list of dominance coefficients, to apply to different selection coefficients
+        (see details). Cannot be specified along with dominance_coeff.
+    :vartype dominance_coeff_list: list of floats
+    :ivar dominance_coeff_breaks: Either None (the default) or a list of floats
+        describing the intervals of selection coefficient over which each of the entries
+        of dominance_coeff_list applies (see details). Must be of length one shorter than
+        dominance_coeff_list.
+    :vartype dominance_coeff_breaks: list of floats
     """
 
-    dominance_coeff = attr.ib(default=0.5, type=float)
+    dominance_coeff = attr.ib(default=None, type=float)
     distribution_type = attr.ib(default="f", type=str)
-    distribution_args = attr.ib(factory=lambda: [0], type=list)
+    distribution_args = attr.ib(
+        factory=lambda: [0], type=list, converter=_copy_converter
+    )
     convert_to_substitution = attr.ib(default=True, type=bool)
+    dominance_coeff_list = attr.ib(default=None, type=list, converter=_copy_converter)
+    dominance_coeff_breaks = attr.ib(default=None, type=list, converter=_copy_converter)
 
     def __attrs_post_init__(self):
-        if not isinstance(self.dominance_coeff, (float, int)):
-            raise ValueError("dominance_coeff must be a number.")
+        if self.dominance_coeff is None and self.dominance_coeff_list is None:
+            self.dominance_coeff = 0.5
+
+        if self.dominance_coeff is not None:
+            if (self.dominance_coeff_list is not None) or (
+                self.dominance_coeff_breaks is not None
+            ):
+                raise ValueError(
+                    "Cannot specify both dominance_coeff and dominance_coeff_list."
+                )
+            if not isinstance(self.dominance_coeff, (float, int)):
+                raise ValueError("dominance_coeff must be a number.")
+            if not np.isfinite(self.dominance_coeff):
+                raise ValueError(
+                    f"Invalid dominance coefficient {self.dominance_coeff}."
+                )
+
+        if self.dominance_coeff_list is not None:
+            # disallow the inefficient and annoying length-one case
+            if len(self.dominance_coeff_list) < 2:
+                raise ValueError("dominance_coeff_list must have at least 2 elements.")
+            for h in self.dominance_coeff_list:
+                if not isinstance(h, (float, int)):
+                    raise ValueError("dominance_coeff_list must be a list of numbers.")
+                if not np.isfinite(h):
+                    raise ValueError(f"Invalid dominance coefficient {h}.")
+            if self.dominance_coeff_breaks is None:
+                raise ValueError(
+                    "A list of dominance coefficients provided but no breaks."
+                )
+            if len(self.dominance_coeff_list) != len(self.dominance_coeff_breaks) + 1:
+                raise ValueError(
+                    "len(dominance_coeff_list) must be equal "
+                    "to len(dominance_coeff_breaks) + 1"
+                )
+            lb = -1 * np.inf
+            for b in self.dominance_coeff_breaks:
+                if not isinstance(b, (float, int)):
+                    raise ValueError(
+                        "dominance_coeff_breaks must be a list of numbers."
+                    )
+                if not np.isfinite(b):
+                    raise ValueError(f"Invalid dominance coefficient break {b}.")
+                if b < lb:
+                    raise ValueError("dominance_coeff_breaks must be nondecreasing.")
+                lb = b
 
         if not isinstance(self.distribution_type, str):
             raise ValueError("distribution_type must be str.")
@@ -69,9 +147,6 @@ def __attrs_post_init__(self):
         if not isinstance(self.convert_to_substitution, bool):
             raise ValueError("convert_to_substitution must be bool.")
 
-        if not np.isfinite(self.dominance_coeff):
-            raise ValueError(f"Invalid dominance coefficient {self.dominance_coeff}.")
-
         # To add a new distribution type: validate the
         # distribution_args here, and add unit tests.
         if self.distribution_type == "f":
@@ -143,6 +218,19 @@ def __attrs_post_init__(self):
                 f"return {sign}rlnorm(1, {logmean} + log(Q), {logsd});"
             ]
             self.distribution_type = "s"
+        elif self.distribution_type == "u":
+            # Uniform
+            if (
+                len(self.distribution_args) != 2
+                or self.distribution_args[0] > self.distribution_args[1]
+            ):
+                raise ValueError(
+                    "Uniformly-distributed sel. coefs. (distribution_type='u') "
+                    "use a (min, max) parameterisation, with min <= max."
+                )
+            umin, umax = self.distribution_args
+            self.distribution_args = [f"return runif(1, Q * {umin}, Q * {umax});"]
+            self.distribution_type = "s"
         else:
             raise ValueError(
                 f"{self.distribution_type} is not a supported distribution type."