Merge pull request #4 from pasmopy/develop

Add R script for visualizing breast cancer classification
pasmopy · May 10, 2021 · cc3d8a3 · cc3d8a3
2 parents a31a4df + a8b1ad5
commit cc3d8a3
Show file tree

Hide file tree

Showing 17 changed files with 557 additions and 254 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.7, 3.8]
+        python-version: [3.8]
 
     steps:
     - uses: actions/checkout@v2

diff --git a/.gitignore b/.gitignore
@@ -30,6 +30,10 @@ transcriptomic_data/*.csv
 GDCdata
 *.rda
 
+# patient classification
+classification/*.csv
+subtype_classification.pdf
+
 # Python ---
 
 # Byte-compiled / optimized / DLL files

diff --git a/README.md b/README.md
@@ -1,50 +1,59 @@
 # breast_cancer
 
-[![Actions Status](https://github.com/dyaus-dev/breast_cancer/workflows/Tests/badge.svg)](https://github.com/dyaus-dev/breast_cancer/actions)
-[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://github.com/dyaus-dev/breast_cancer/blob/master/LICENSE)
+[![Actions Status](https://github.com/pasmopy/breast_cancer/workflows/Tests/badge.svg)](https://github.com/pasmopy/breast_cancer/actions)
+[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://opensource.org/licenses/Apache-2.0)
 
 Workflow for classifying breast cancer subtypes based on intracellular signaling dynamics.
 
 ## Requirements
 
-| Language      | Dependent packages                                 |
-| ------------- | -------------------------------------------------- |
-| Python >= 3.7 | [dyaus](https://github.com/dyaus-dev/dyaus)        |
-| Julia >= 1.5  | [BioMASS.jl](https://github.com/himoto/BioMASS.jl) |
-| R >= 4.0      | TCGAbiolinks, sva, biomaRt, edgeR                  |
+| Language      | Dependent packages                                             |
+| ------------- | -------------------------------------------------------------- |
+| Python >= 3.7 | [pasmopy](https://github.com/pasmopy/pasmopy)                  |
+| Julia >= 1.5  | [BioMASS.jl](https://github.com/himoto/BioMASS.jl)             |
+| R >= 4.0      | TCGAbiolinks, sva, biomaRt, edgeR, ComplexHeatmap, viridisLite |
 
 ## Table of contents
 
-- [Construction](#construction-of-a-comprehensive-model-of-the-ErbB-signaling-network)
-
 - [Integration](#integration-of-tcga-and-ccle-data)
 
+- [Construction](#construction-of-a-comprehensive-model-of-the-ErbB-signaling-network)
+
 - [Individualization](#individualization-of-the-mechanistic-model)
 
 - [Classification](#subtype-classification-based-on-the-ErbB-signaling-dynamics)
 
+## Integration of TCGA and CCLE data
+
+- Run [`transcriptomic_data_integration.R`](transcriptomic_data/transcriptomic_data_integration.R)
+
+  ```bash
+  $ cd transcriptomic_data
+  $ Rscript transcriptomic_data_integration.R
+  ```
+
 ## Construction of a comprehensive model of the ErbB signaling network
 
-1. Use `dyaus.Text2Model` to build a mechanistic model
+1. Use `pasmopy.Text2Model` to build a mechanistic model
 
    ```python
-   from dyaus import Text2Model
+   from pasmopy import Text2Model
 
    Text2Model("models/erbb_network.txt").to_biomass_model()
    ```
 
-1. Rename **erbb_network/** to CCLE_name or TCGA_ID, e.g., **MCF7_BREAST** or **TCGA_3C_AALK_01A**
+1. Rename `erbb_network/` to CCLE_name or TCGA_ID, e.g., `MCF7_BREAST` or `TCGA_3C_AALK_01A`
 
-1. Add weighting factors for each gene (prefix: `"w_"`) to **name2idx/parameters.py**
+1. Add weighting factors for each gene (prefix: `"w_"`) to [`name2idx/parameters.py`](models/breast/TCGA_3C_AALK_01A/name2idx/parameters.py)
 
-1. Edit **set_search_param.py**
+1. Edit [`set_search_param.py`](models/breast/TCGA_3C_AALK_01A/set_search_param.py)
 
    ```python
    import os
 
    import numpy as np
 
-   from dyaus import Individualization
+   from pasmopy import Individualization
 
    from . import __path__
    from .name2idx import C, V
@@ -54,7 +63,7 @@ Workflow for classifying breast cancer subtypes based on intracellular signaling
        parameters=C.NAMES,
        species=V.NAMES,
        tpm_values="transcriptomic_data/TPM_RLE_postComBat.csv",
-       structure={
+       gene_expression={
            "ErbB1": ["EGFR"],
            "ErbB2": ["ERBB2"],
            "ErbB3": ["ERBB3"],
@@ -76,6 +85,7 @@ Workflow for classifying breast cancer subtypes based on intracellular signaling
            "DUSP": ["DUSP5", "DUSP6", "DUSP7"],
            "cMyc": ["MYC"],
        },
+       read_csv_kws={"index_col": "Description"}
    )
 
    ...
@@ -104,30 +114,23 @@ Workflow for classifying breast cancer subtypes based on intracellular signaling
        ...
    ```
 
-## Integration of TCGA and CCLE data
-
-```bash
-$ cd transcriptomic_data
-$ Rscript transcriptomic_data_integration.R
-```
-
 ## Individualization of the mechanistic model
 
 ### Use time-course datasets to train kinetic constants and weighting factors
 
 1. Build a mechanistic model for parameter estimation with BioMASS.jl
 
    ```python
-   from dyaus import Text2Model
+   from pasmopy import Text2Model
 
    Text2Model("models/erbb_network.txt", lang="julia").to_biomass_model()
    ```
 
-1. Add time-series data to **experimental_data.jl**
+1. Add time-series data to [`experimental_data.jl`](training/erbb_network_jl/experimental_data.jl)
 
-1. Set an objective function to be minimized in **fitness.jl**
+1. Set an objective function to be minimized in [`fitness.jl`](training/erbb_network_jl/fitness.jl)
 
-1. Run **optimize_parallel.sh**
+1. Run [`optimize_parallel.sh`](training/optimize_parallel.sh)
 
    ```bash
    $ mv erbb_network_jl training
@@ -159,7 +162,7 @@ $ Rscript transcriptomic_data_integration.R
             f.startswith("TCGA_") or f.endswith("_BREAST")
         ):
             models.append(f)
-   # Set optimized parameter sets
+   # Set optimized parameters
    for model in models:
        shutil.copytree(
            os.path.join("training", "dat2npy", "out"),
@@ -169,33 +172,52 @@ $ Rscript transcriptomic_data_integration.R
 
 ### Execute patient-specific models
 
-1. Use `dyaus.PatientModelSimulations`
+- Use `pasmopy.PatientModelSimulations`
 
-   ```python
-   import os
-   import shutil
+  ```python
+  import os
+  import shutil
+
+  from pasmopy import PatientModelSimulations
 
-   from dyaus import PatientModelSimulations
-
-
-   with open (os.path.join("models", "breast", "sample_names.txt"), mode="r") as f:
-       TCGA_ID = f.read().splitlines()
-   # Create patient-specific models
-   for patient in TCGA_ID:
-       if patient != "TCGA_3C_AALK_01A":
-           shutil.copytree(
-               os.path.join("models", "breast", "TCGA_3C_AALK_01A"),
-               os.path.join("models", "breast", f"{patient}"),
-           )
-   # Execute patient-specific models
-   simulations = PatientModelSimulations("models.breast", TCGA_ID)
-   simulations.run()
-   ```
+
+  with open (os.path.join("models", "breast", "sample_names.txt"), mode="r") as f:
+      TCGA_ID = f.read().splitlines()
+  # Create patient-specific models
+  for patient in TCGA_ID:
+      if patient != "TCGA_3C_AALK_01A":
+          shutil.copytree(
+              os.path.join("models", "breast", "TCGA_3C_AALK_01A"),
+              os.path.join("models", "breast", f"{patient}"),
+          )
+  # Execute patient-specific models
+  simulations = PatientModelSimulations("models.breast", TCGA_ID)
+  simulations.run()
+  ```
 
 ## Subtype classification based on the ErbB signaling dynamics
 
-[TODO] Write analysis procedure here.
+1. Extract response characteristics from patient-specific simulations
+
+   ```python
+   simulations.subtyping(
+       None,
+       {
+           "Phosphorylated_Akt": {"EGF": ["max"], "HRG": ["max"]},
+           "Phosphorylated_ERK": {"EGF": ["max"], "HRG": ["max"]},
+           "Phosphorylated_c-Myc": {"EGF": ["max"], "HRG": ["max"]},
+       }
+   )
+   ```
+
+1. Visualize patient classification by executing [`brca_heatmap.R`](classification/brca_heatmap.R)
+
+   ```bash
+   $ cd classification
+   # $ Rscript brca_heatmap.R [n_cluster: int] [figsize: tuple]
+   $ Rscript brca_heatmap.R 6 8,5
+   ```
 
 ## License
 
-[Apache License 2.0](https://github.com/dyaus-dev/dyaus/blob/master/LICENSE)
+[Apache License 2.0](https://github.com/pasmopy/breast_cancer/blob/master/LICENSE)
diff --git a/classification/.gitkeep b/classification/.gitkeep
diff --git a/classification/brca_heatmap.R b/classification/brca_heatmap.R
@@ -0,0 +1,116 @@
+
+#use following packages
+suppressPackageStartupMessages(library(dplyr))
+suppressPackageStartupMessages(library(tibble))
+suppressPackageStartupMessages(library(ComplexHeatmap))
+suppressPackageStartupMessages(library(grDevices))
+suppressPackageStartupMessages(library(circlize))
+suppressPackageStartupMessages(library(TCGAbiolinks))
+suppressPackageStartupMessages(library(stringr))
+suppressPackageStartupMessages(library(viridisLite))
+suppressPackageStartupMessages(library(cluster))
+
+
+#read arguments
+args <- commandArgs(trailingOnly = TRUE)
+
+
+#read csv file of features
+path <- getwd()
+
+for(i in 1:length(list.files(path, pattern = "csv"))){
+  if (i == 1){
+    temp <- list.files(path, pattern = "csv")[i]
+    read_temp <- data.frame(lapply(temp, read.csv))
+    colnames(read_temp)[-1] <- paste(str_sub(temp, end = -5), colnames(read_temp)[-1], sep = "_")
+    dynamics_info <- read_temp
+  } else {
+    temp <- list.files(path, pattern = "csv")[i]
+    read_temp <- data.frame(lapply(temp, read.csv))
+    colnames(read_temp)[-1] <- paste(str_sub(temp, end = -5), colnames(read_temp)[-1], sep = "_")
+    dynamics_info <- cbind(dynamics_info, read_temp[,-1])
+  }
+}
+
+
+
+#read clinical information of TCGA-BRCA
+BRCA_path_subtypes <- TCGAquery_subtype(tumor = "brca")
+BRCA_path_subtypes_annotation <- data.frame(patient = gsub("-", "_", BRCA_path_subtypes$patient),
+                                            days_to_death = BRCA_path_subtypes$days_to_death,
+                                            subtype = BRCA_path_subtypes$BRCA_Subtype_PAM50)
+
+
+#fix data frame of annotation for heatmap
+annotation <- c()
+for(i in 1:nrow(dynamics_info)){
+  temp <- BRCA_path_subtypes_annotation[BRCA_path_subtypes_annotation$patient == 
+                                          str_sub(dynamics_info$Sample, end = -5)[i],]
+  annotation <- rbind(annotation, temp)
+}
+
+
+suppressWarnings(annotation$days_to_death <- as.numeric(annotation$days_to_death))
+annotation[is.na(annotation)] <- 0
+
+
+#add prognosis information
+for (i in 1:nrow(annotation)){
+  if (annotation$days_to_death[i] == 0){
+    annotation$prognosis[i] <- 20
+  } else if (annotation$days_to_death[i] < 365 & annotation$days_to_death[i] > 0) {
+    annotation$prognosis[i] <- 1
+  } else if (annotation$days_to_death[i] > 366 & annotation$days_to_death[i] < 730){
+    annotation$prognosis[i] <- 2
+  } else if (annotation$days_to_death[i] > 731 & annotation$days_to_death[i] < 1095){
+    annotation$prognosis[i] <- 3
+  } else if (annotation$days_to_death[i] > 1096 & annotation$days_to_death[i] < 1825){
+    annotation$prognosis[i] <- 4
+  } else if (annotation$days_to_death[i] > 1826 & annotation$days_to_death[i] < 2190){
+    annotation$prognosis[i] <- 5
+  } else if (annotation$days_to_death[i] > 2191 & annotation$days_to_death[i] < 2555){
+    annotation$prognosis[i] <- 6
+  } else if (annotation$days_to_death[i] > 2556 & annotation$days_to_death[i] < 2920){
+    annotation$prognosis[i] <- 7
+  } else if (annotation$days_to_death[i] > 2921 & annotation$days_to_death[i] < 3650){
+    annotation$prognosis[i] <- 8
+  } else if (annotation$days_to_death[i] > 3651 & annotation$days_to_death[i] < 4015){
+    annotation$prognosis[i] <- 9
+  } else if (annotation$days_to_death[i] > 4016 & annotation$days_to_death[i] < 4380){
+    annotation$prognosis[i] <- 10
+  } else if (annotation$days_to_death[i] > 4381){
+    annotation$prognosis[i] <- 11
+  }
+}
+
+
+#heatmap
+dynamics_info_zscore <- dynamics_info  %>% column_to_rownames(var = "Sample") %>% scale()
+km <- pam(as.matrix(dynamics_info_zscore), metric = "euclidean", k = as.numeric(args[1]))
+
+gradPal <- grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(n = 11, name = "RdBu")))(7)
+set.seed(1234)
+hm <- Heatmap(as.matrix(dynamics_info_zscore), show_row_names = F, name = "Z-score",
+              clustering_distance_rows = "euclidean",
+              clustering_method_rows = "ward.D2",
+              row_split = km$clustering,
+              na_col = "black",
+              col = colorRamp2(c(-3:3), gradPal),
+              row_dend_reorder = TRUE,
+              cluster_row_slices = F) +
+  Heatmap(annotation$prognosis, name = "Prognosis",
+          show_row_names = F,
+          width = unit(0.5,"cm"),
+          col = viridis(100)) +
+  Heatmap(annotation$subtype, name = "Subtype",
+          show_row_names = F,
+          width = unit(0.5,"cm"),
+          col = c("LumA" = "#304698", "LumB" = "#68c6ea", "Her2" = "#f489b9", "Basal" = "#e02828", "Normal" = "#00882c")) 
+
+
+#save heatmap as pdf file
+hw <- as.numeric(strsplit(args[2], ",")[[1]])
+pdf("Heatmap.pdf", height = hw[1], width = hw[2])
+hm
+invisible(dev.off())
+