pasmopy · himoto · Aug 8, 2021 · Jul 24, 2021 · Jul 27, 2021 · Jul 28, 2021
diff --git a/.gitignore b/.gitignore
@@ -11,7 +11,7 @@ out/
 
 # patient-specific models
 TCGA_*
-!/models/breast/TCGA_3C_AALK_01A
+!/models/colon/TCGA_4T_AA8H_01A
 
 # sensitivity array
 sensitivity_coefficients/
@@ -27,6 +27,7 @@ errout/
 
 # transcriptomic data
 transcriptomic_data/*.csv
+!/transcriptomic_data/TPM_RLE_postComBat_COAD_BREAST.csv
 GDCdata
 *.rda
 

diff --git a/README.md b/README.md
@@ -87,7 +87,7 @@ R:
 
 ### Download TCGA gene expression data (HTSeq-Counts)
 
- - Download the gene expression data of the specified sample types [(Sample Type Codes)](https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes) in the cancer type specified by `outputClinical()` or `outputSubtype()`. By running this code, you can get data of only the patients selected by `sampleSelection()`.
+ - Download the gene expression data of the specified sample types ([Sample Type Codes](https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes)) in the cancer type specified by `outputClinical()` or `outputSubtype()`. By running this code, you can get data of only the patients selected by `sampleSelection()`.
 
    ```R
    downloadTCGA(cancertype = "BRCA", 
@@ -144,9 +144,55 @@ R:
    Text2Model(os.path.join("models", "erbb_network.txt")).convert()
    ```
 
+1. Add weighting factors for each gene (prefix: `"w_"`) to [`name2idx/parameters.py`](models/breast/TCGA_3C_AALK_01A/name2idx/parameters.py)
+
+    ```python
+    from pasmopy.preprocessing import WeightingFactors
+    from biomass import Model
+
+    from models import erbb_network
+
+
+    model = Model(erbb_network.__package__).create()
+
+    gene_expression = {
+        "ErbB1": ["EGFR"],
+        "ErbB2": ["ERBB2"],
+        "ErbB3": ["ERBB3"],
+        "ErbB4": ["ERBB4"],
+        "Grb2": ["GRB2"],
+        "Shc": ["SHC1", "SHC2", "SHC3", "SHC4"],
+        "RasGAP": ["RASA1", "RASA2", "RASA3"],
+        "PI3K": ["PIK3CA", "PIK3CB", "PIK3CD", "PIK3CG"],
+        "PTEN": ["PTEN"],
+        "SOS": ["SOS1", "SOS2"],
+        "Gab1": ["GAB1"],
+        "RasGDP": ["HRAS", "KRAS", "NRAS"],
+        "Raf": ["ARAF", "BRAF", "RAF1"],
+        "MEK": ["MAP2K1", "MAP2K2"],
+        "ERK": ["MAPK1", "MAPK3"],
+        "Akt": ["AKT1", "AKT2"],
+        "PTP1B": ["PTPN1"],
+        "GSK3b": ["GSK3B"],
+        "DUSP": ["DUSP5", "DUSP6", "DUSP7"],
+        "cMyc": ["MYC"],
+    }
+
+    weighting_factors = WeightingFactors(model, gene_expression)
+    weighting_factors.add()
+    weighting_factors.set_search_bounds()
+    ```
+
 1. Rename `erbb_network/` to CCLE_name or TCGA_ID, e.g., `MCF7_BREAST` or `TCGA_3C_AALK_01A`
 
-1. Add weighting factors for each gene (prefix: `"w_"`) to [`name2idx/parameters.py`](models/breast/TCGA_3C_AALK_01A/name2idx/parameters.py)
+    ```python
+    import shutil
+
+    shutil.move(
+        os.path.join("models", "erbb_network"),
+        os.path.join("models", "breast", "TCGA_3C_AALK_01A")
+    )
+    ```
 
 1. Edit [`set_search_param.py`](models/breast/TCGA_3C_AALK_01A/set_search_param.py)
 

diff --git a/drug_response/README.md b/drug_response/README.md
@@ -15,13 +15,43 @@
 - Barretina, J., Caponigro, G., Stransky, N. _et al_. The Cancer Cell Line Encyclopedia enables predictive modelling of anticancer drug sensitivity. _Nature_ **483**, 603–607 (2012). https://doi.org/10.1038/nature11003
 
 ## Usage
-
-1. Calculation of the ErbB receptor expression ratio
+1. Prepare sample.txt and gene.txt, which contain the names of the samples and genes, respectively.
+1. Open R
 
    ```bash
-   $ Rscript data/calc_erbb_ratio.R
+   $ R
+   ```  
+
+1. Load CCLE_normalization.R
+
+   ```R
+   source("calc_erbb_ratio.R")
+   ```
+
+1. Read sample.txt and gene.txt  
+If you want data for a specific samples or genes, you will need to create a list of those samples as `sample.txt` or `gene.txt` and read it here. 
+   ```R
+   gene <- scan("gene.txt", what="character")
+   #sample <- scan("sample.txt", what="character")
    ```
 
+1. Create TPM/RLE normalized RNA-seq data matrix of selected samples and genes
+   ```R
+   CCLEnormalization(gene, sample = NULL)
+   ```
+   Output: CCLE_normalized.csv (TPM/RLE normalized RNA-seq data for specific samples or genes)
+
+1. Calculate receotor ratio  
+Calculate EGFR/(ERBB2+ERBB3+ERBB4) using CCLE_normalized.csv.  
+If you want to run this coad, you need to prepare gene.txt containing the names of these 4 genes.
+   ```R
+   CCLE_normalized <- read.csv("CCLE_normalized.csv", row.names = 1)
+   receptor_ratio(data = CCLE_normalized, num = 30)
+   ```
+   Output: "ErbB_expression_ratio.csv"
+
+
+
 1. Drug response analysis and visualization
 
    ```python

diff --git a/drug_response/data/calc_erbb_ratio.R b/drug_response/data/calc_erbb_ratio.R
@@ -1,3 +1,4 @@
+
 suppressPackageStartupMessages(library(dplyr))
 suppressPackageStartupMessages(library(biomaRt))
 suppressPackageStartupMessages(library(edgeR))
@@ -6,78 +7,92 @@ suppressPackageStartupMessages(library(stringr))
 
 
 
-
-# download counts file
-# https://data.broadinstitute.org/ccle/CCLE_RNAseq_genes_counts_20180929.gct.gz
-url <- "https://data.broadinstitute.org/ccle/CCLE_RNAseq_genes_counts_20180929.gct.gz"
-tmp <- tempfile()
-download.file(url, tmp)
-CCLE_counts <- read.table(gzfile(tmp), skip = 2, header = T)
-CCLE_counts[,1] <- str_sub(CCLE_counts[,1], start = 1, end = 15)
-
-
-
-# get RNA length 
-dataset="hsapiens_gene_ensembl"
-mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = dataset)
-gene.annotations <- biomaRt::getBM(mart = mart, attributes=c("ensembl_gene_id", "start_position", "end_position"))
-gene.annotations <- dplyr::transmute(gene.annotations, ensembl_gene_id, length = end_position - start_position)                        
-
-
-# merge count file with length by ensembl_gene_id_version
-colnames(CCLE_counts)[1] <- "ensembl_gene_id"
-countfile_length <- data.frame(merge(gene.annotations,CCLE_counts))
-
-
-
-# data normalization
-counts_tpm <- (countfile_length[,4:ncol(countfile_length)]/countfile_length$length) *1000
-counts_tpm <- cbind(Geneid = countfile_length$ensembl_gene_id, counts_tpm) %>% as_tibble()
-normfactor <- DGEList(counts = counts_tpm[,2:ncol(counts_tpm)],
-                      group = colnames(counts_tpm[,2:ncol(counts_tpm)]))
-normfactor <- calcNormFactors(normfactor, method="RLE")
-normfactor_samples <- normfactor$samples
-normfactor_samples$normlib <- normfactor_samples$lib.size*normfactor_samples$norm.factors
-
-
-for(i in 1:(dim(counts_tpm)[2]-1)){
-  counts_tpm[,i+1] <- (counts_tpm[,i+1]/normfactor_samples$normlib[i])*1000000
+# CCLE normalization ------------------------------------------------------
+
+CCLEnormalization <- function(gene, sample = NULL){
+  # download CCLE RNA-seq counts data
+  url <- "https://data.broadinstitute.org/ccle/CCLE_RNAseq_genes_counts_20180929.gct.gz"
+  tmp <- tempfile()
+  download.file(url, tmp)
+  CCLE_counts <- read.table(gzfile(tmp), skip = 2, header = T)
+  CCLE_counts[,1] <- str_sub(CCLE_counts[,1], start=1, end=15)
+  colnames(CCLE_counts)[1] <- "ensembl_gene_id"
+
+  # get gene length
+  mart <- useEnsembl(biomart = "ensembl", 
+                     dataset = "hsapiens_gene_ensembl", 
+                     mirror = "asia")
+  gene.annotations <- biomaRt::getBM(mart=mart,
+                                     attributes=c(
+                                       "ensembl_gene_id", "start_position", "end_position"))
+  gene.annotations <- dplyr::transmute(gene.annotations, ensembl_gene_id, 
+                                       length = end_position - start_position)    
+
+  # merge count file with length by ensembl_gene_id_version
+  countsfile_length <- inner_join(
+    gene.annotations,
+    CCLE_counts,
+    by = "ensembl_gene_id"
+  )
+
+  # data normalization
+  counts_tpm <- (countsfile_length[,4:ncol(countsfile_length)]/countsfile_length$length)*1000
+  counts_tpm <- cbind(Description=countsfile_length$Description, counts_tpm) %>% as_tibble()
+  normfactor <- DGEList(counts=counts_tpm[,2:ncol(counts_tpm)],
+                        group=colnames(counts_tpm[,2:ncol(counts_tpm)]))
+  normfactor <- calcNormFactors(normfactor, method="RLE")
+  normfactor_samples <- normfactor$samples
+  normfactor_samples$normlib <- normfactor_samples$lib.size*normfactor_samples$norm.factors
+
+  for(i in 1:(dim(counts_tpm)[2]-1)){
+    counts_tpm[,i+1] <- (counts_tpm[,i+1]/normfactor_samples$normlib[i])*1000000
+  } 
+
+  if(is.null(sample)){
+    counts_tpm_selected <- data.frame(Description = counts_tpm[counts_tpm$Description %in% gene,]$Description,
+                                      counts_tpm[counts_tpm$Description %in% gene,-1]) %>% 
+      column_to_rownames(var = "Description") %>% t() 
+  } else {
+    counts_tpm_selected <- data.frame(Description = counts_tpm[counts_tpm$Description %in% gene,]$Description,
+                                      counts_tpm[counts_tpm$Description %in% gene ,colnames(counts_tpm) %in% sample]) %>% 
+      column_to_rownames(var = "Description") %>% t() 
+  }
+  write.csv(counts_tpm_selected, "CCLE_normalized.csv")
 }
-counts_tpm[,1] <- countfile_length$Description
-
-
-# log2(TPM+1) calculation
-CCLE_symbol <- counts_tpm %>% distinct(Geneid, .keep_all=T) %>% column_to_rownames("Geneid") %>% t() %>% as.data.frame()
-CCLE_symbol <- log2(CCLE_symbol + 1)
-
-
-# receptor gene expression
-CCLE_receptor <- data.frame(EGFR = CCLE_symbol$EGFR,
-                            ERBB234 = CCLE_symbol$ERBB2 + CCLE_symbol$ERBB3 + CCLE_symbol$ERBB4,
-                            ratio = CCLE_symbol$EGFR/(CCLE_symbol$ERBB2 + CCLE_symbol$ERBB3 + CCLE_symbol$ERBB4),
-                            row.names = rownames(CCLE_symbol))
-
-
-
-
-# download drug sensitivity file
-# https://data.broadinstitute.org/ccle_legacy_data/pharmacological_profiling/CCLE_NP24.2009_Drug_data_2015.02.24.csv
-url_drug <- "https://data.broadinstitute.org/ccle_legacy_data/pharmacological_profiling/CCLE_NP24.2009_Drug_data_2015.02.24.csv"
-tmp_drug <- tempfile()
-download.file(url_drug, tmp_drug)
-CCLE_drug <- read.table(gzfile(tmp_drug), header = T, sep = ",")
-
-drug_cellline <- unique(CCLE_drug$CCLE.Cell.Line.Name)
-
-CCLE_receptor_drug <- CCLE_receptor[rownames(CCLE_receptor) %in% drug_cellline,]
-CCLE_receptor_drug_median <- CCLE_receptor_drug[CCLE_receptor_drug$EGFR > median(CCLE_receptor_drug$EGFR),]
-CCLE_receptor_order <- CCLE_receptor_drug_median[order(CCLE_receptor_drug_median$ratio, decreasing = T),]
-
-
-CCLE_receptor_order$value <- c(rep("high", 30),
-                               rep("middle",nrow(CCLE_receptor_order) - 2*30),
-                               rep("low", 30))
 
+CCLEnormalization(gene = gene)
+
+
+
+# calculate receptor ratio ------------------------------------------------
+
+
+receptor_ratio <- function(data, num){
+  data <- log2(data + 1)
+  CCLE_receptor <- data.frame(EGFR = data$EGFR,
+                              ERBB234 = data$ERBB2 + data$ERBB3 + data$ERBB4,
+                              ratio = data$EGFR/(data$ERBB2 + data$ERBB3 + data$ERBB4),
+                              row.names = rownames(data))
+
+  #filtering cell-lines (only cell-lines in CCLE drug response data)
+  url_drug <- "https://data.broadinstitute.org/ccle_legacy_data/pharmacological_profiling/CCLE_NP24.2009_Drug_data_2015.02.24.csv"
+  tmp_drug <- tempfile()
+  download.file(url_drug, tmp_drug)
+  CCLE_drug <- read.table(gzfile(tmp_drug), header = T, sep = ",")
+
+  drug_cellline <- unique(CCLE_drug$CCLE.Cell.Line.Name)
+
+  #filtering cell-lines (only cell-lines whose EGFR expression level is higher than the median expression level of EGFR in all cell-lines)
+  CCLE_receptor_drug <- CCLE_receptor[rownames(CCLE_receptor) %in% drug_cellline,]
+  CCLE_receptor_drug_median <- CCLE_receptor_drug[CCLE_receptor_drug$EGFR > median(CCLE_receptor_drug$EGFR),]
+  CCLE_receptor_order <- CCLE_receptor_drug_median[order(CCLE_receptor_drug_median$ratio, decreasing = T),]
+
+  #evaluation of cell-lines
+  CCLE_receptor_order$value <- c(rep("high", num),
+                                 rep("middle",nrow(CCLE_receptor_order) - 2*num),
+                                 rep("low", num))
+
+  write.csv(CCLE_receptor_order, "ErbB_expression_ratio.csv")
+}
 
 
-write.csv(CCLE_receptor_order, file.path("data", "ErbB_expression_ratio.csv"))
diff --git a/drug_response/data/gene.txt b/drug_response/data/gene.txt
@@ -0,0 +1,4 @@
+EGFR
+ERBB2
+ERBB3
+ERBB4