Skip to content

Commit

Permalink
Merge pull request #3 from dyaus-dev/transcriptome
Browse files Browse the repository at this point in the history
Add R script for integrating TCGA and CCLE data
  • Loading branch information
Hiroaki Imoto committed Apr 28, 2021
2 parents 7416d7a + 1f92b90 commit a31a4df
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 26 deletions.
31 changes: 6 additions & 25 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ errout/

# transcriptomic data
transcriptomic_data/*.csv
GDCdata
*.rda

# Python ---

Expand Down Expand Up @@ -58,30 +60,9 @@ share/python-wheels/
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
Expand Down Expand Up @@ -131,12 +112,12 @@ vignettes/*.pdf
.httr-oauth

# knitr and R markdown default cache directories
*_cache/
/cache/
*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md
*.utf8.md
*.knit.md

# R Environment Variables
.Renviron
Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,10 @@ Workflow for classifying breast cancer subtypes based on intracellular signaling

## Integration of TCGA and CCLE data

[TODO] Write analysis procedure here.
```bash
$ cd transcriptomic_data
$ Rscript transcriptomic_data_integration.R
```

## Individualization of the mechanistic model

Expand Down
Empty file removed integration/.gitkeep
Empty file.
162 changes: 162 additions & 0 deletions transcriptomic_data/transcriptomic_data_integration.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
library(dplyr)
library(stringr)
library(tibble)

# TCGA ---------------------------------------------------------------------------------------

# download TCGA data from TCGA biolinks -----------------------------------
# https://bioconductor.org/packages/release/bioc/manuals/TCGAbiolinks/man/TCGAbiolinks.pdf

library(TCGAbiolinks)

# download counts file
query <- GDCquery(project = "TCGA-BRCA",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts")
results <- getResults(query)

GDCdownload(query)
df <- GDCprepare(query, save = T, save.filename = "TCGABRCA_counts.rda",
summarizedExperiment = F)

# download clinical data
BRCA_path_subtypes <- TCGAquery_subtype(tumor = "brca")
BRCA_path_subtypes_age_stage <- data.frame(patient = BRCA_path_subtypes$patient,
stage = BRCA_path_subtypes$pathologic_stage,
subtype = BRCA_path_subtypes$BRCA_Subtype_PAM50,
age = BRCA_path_subtypes$age_at_initial_pathologic_diagnosis)



# exclude columns other than counts data ----------------------------------
df_counts <- df[grep("ENSG", df$X1),]


# exclude some samples ----------------------------------------------------
# exclude normal(non-cancerous) and duplicated samples
# https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/
# https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes
pattern <- c("-11A-", "-11B-", "-01B-", "-01C-")
TCGA_counts <- df_counts[,-grep(paste(pattern, collapse = "|"), colnames(df_counts))]
colnames(TCGA_counts) <- str_sub(colnames(TCGA_counts), start = 1, end = 16)
TCGA_counts <- TCGA_counts[,!duplicated(colnames(TCGA_counts))] #exclude duplicated samples
TCGA_counts$X1 <- str_sub(TCGA_counts$X1, start = 1, end = 15)


# screen samples with patient's age and pathologic stage ------------------
# patients's age <= 59 & pathological stage <- stage I and II
BRCA_path_subtypes_age_stage_filtered <- BRCA_path_subtypes_age_stage[BRCA_path_subtypes_age_stage$age <= 59 &
BRCA_path_subtypes_age_stage$stage %in% c("Stage_I", "Stage_II"),]
BRCA_path_subtypes_age_stage_filtered$patient <- gsub("-", ".", BRCA_path_subtypes_age_stage_filtered$patient)

# make counts file with filtered patients ---------------------------------
TCGA_counts_filtered <- data.frame(Name = TCGA_counts$X1,
TCGA_counts[,grep(paste(BRCA_path_subtypes_age_stage_filtered$patient, collapse = "|"),
colnames(TCGA_counts))])


# CCLE ---------------------------------------------------------------------------------------

# download counts file
# https://data.broadinstitute.org/ccle/CCLE_RNAseq_genes_counts_20180929.gct.gz
url <- "https://data.broadinstitute.org/ccle/CCLE_RNAseq_genes_counts_20180929.gct.gz"
tmp <- tempfile()
download.file(url, tmp)
CCLE_counts <- read.table(gzfile(tmp), skip = 2, header = T)

# get only cell lines of breast cancer from CCLE datasets
CCLE_BRCA <- data.frame(Name = CCLE_counts$Name,
Description = CCLE_counts$Description,
CCLE_counts[,grep("BREAST", colnames(CCLE_counts))])
CCLE_BRCA[,1] <- str_sub(CCLE_BRCA[,1], start = 1, end = 15)




# Merge TCGA and CCLE ------------------------------------------------------------------------

TCGA_CCLE_counts <- inner_join(CCLE_BRCA, TCGA_counts_filtered,
by = "Name")
TCGA_CCLE_counts <- TCGA_CCLE_counts %>%
dplyr::distinct(Description, .keep_all=T) %>% column_to_rownames(var = "Name")
TCGA_CCLE_counts_numeric <- sapply(TCGA_CCLE_counts[,-1], as.numeric)
rownames(TCGA_CCLE_counts_numeric) <- rownames(TCGA_CCLE_counts)




# ComBat-seq ---------------------------------------------------------------------------------
# https://academic.oup.com/nargab/article/2/3/lqaa078/5909519
# https://rdrr.io/bioc/sva/man/ComBat_seq.html
# make batch file for ComBat-seq

library(sva)

batch <- data.frame(sample = colnames(TCGA_CCLE_counts_numeric),
batch = c(rep(1,length(grep("BREAST", colnames(TCGA_CCLE_counts_numeric)))),
rep(2,length(grep("TCGA", colnames(TCGA_CCLE_counts_numeric))))))
TCGA_CCLE_counts_matrix <- data.matrix(TCGA_CCLE_counts_numeric)
postComBat <- ComBat_seq(TCGA_CCLE_counts_matrix, batch = batch$batch, group = NULL)
postComBat_ensembl <- data.frame(Name = rownames(postComBat),
Description = TCGA_CCLE_counts$Description,
postComBat)



# Normalization ------------------------------------------------------------------------------

# get gene length for calculating TPM
library(biomaRt)

mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl")
listAttributes(mart)

gene.annotations <- biomaRt::getBM(mart = mart, attributes=c("ensembl_gene_id", "start_position", "end_position"))
gene.annotations <- dplyr::transmute(gene.annotations, ensembl_gene_id, length = end_position - start_position)

# excluding samples with total read counts below 40000000 and above 140000000
allread <- apply (postComBat_ensembl[,-c(1:2)],2,sum)
allread_sub <- allread[allread >= 40000000 & allread <= 140000000]


postComBat_sub_ensembl <- data.frame(ensembl_gene_id = postComBat_ensembl$Name,
Description = postComBat_ensembl$Description,
postComBat_ensembl[,colnames(postComBat_ensembl) %in% names(allread_sub)])

countfile_length <- inner_join(gene.annotations,
postComBat_sub_ensembl,
by = "ensembl_gene_id")



# Count TPM ---------------------------------------------------------------

library(edgeR)

counts_tpm <- (countfile_length[,4:ncol(countfile_length)]/countfile_length$length) *1000
counts_tpm <- cbind(ensembl_gene_id = countfile_length$ensembl_gene_id, counts_tpm) %>% as_tibble()

normfactor <- DGEList(counts = counts_tpm[,2:ncol(counts_tpm)],
group = colnames(counts_tpm[,2:ncol(counts_tpm)]))
normfactor <- calcNormFactors(normfactor, method="RLE")
normfactor_samples <- normfactor$samples
normfactor_samples$normlib <- normfactor_samples$lib.size*normfactor_samples$norm.factors


for(i in 1:(dim(counts_tpm)[2]-1)){
counts_tpm[,i+1] <- (counts_tpm[,i+1]/normfactor_samples$normlib[i])*1000000
}
ensembl_symbol <- countfile_length[,c(1,3)]
counts_tpm_symbol <- inner_join(ensembl_symbol,
counts_tpm,
by = "ensembl_gene_id")

write.csv(counts_tpm_symbol, "TPM_RLE_postComBat.csv")


# counts_tpm_symbol_log2 <- data.frame(ensembl_gene_id = counts_tpm_symbol$ensembl_gene_id,
# Description = counts_tpm_symbol$Description,
# log2(counts_tpm_symbol[,-c(1,2)] + 1))

# write.csv(counts_tpm_symbol_log2, "TPM_RLE_log2_postComBat.csv")

0 comments on commit a31a4df

Please sign in to comment.