forked from Zheng206/ComBatFam_Pipeline
-
Notifications
You must be signed in to change notification settings - Fork 1
/
post_CLI.R
159 lines (141 loc) · 7.91 KB
/
post_CLI.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
suppressMessages(library(argparser))
suppressMessages(library(tidyverse))
suppressMessages(library(readxl))
suppressMessages(library(ComBatFamQC))
suppressMessages(library(parallel))
## Read in arguments
p <- arg_parser("Post-harmonization processing step", hide.opts = FALSE)
p <- add_argument(p, "data", help = "path to the CSV or EXCEL file that contains data to be harmonized, covariates and batch information")
p <- add_argument(p, "--type", short = '-t', help = "post-harmonization processing type, eg: residual or age_trend", default = "age_trend")
p <- add_argument(p, "--features", short = '-f', help = "position of features/rois data(column numbers), eg: 1-5,9")
p <- add_argument(p, "--AGE", help = "column position of the age variable")
p <- add_argument(p, "--SEX", help = "column position of the sex variable")
p <- add_argument(p, "--ICV", help = "column position of the ICV variable")
p <- add_argument(p, "--Female", help = "female indicator, the value represent female in sex column.")
p <- add_argument(p, "--lowerquantile", short = '-l', help = "Specify a lower bound quantile. eg: 0.05, 0.25.", default = 0.25)
p <- add_argument(p, "--upperquantile", short = '-u', help = "Specify a upper bound quantile. eg: 0.75, 0.95.", default = 0.75)
p <- add_argument(p, "--Mu", help = "An indicator of whether to smooth age variable, include it as a linear term or only include the intercept in the mu formula. smooth: y ~ pb(age), linear: y ~ age, default: y ~ 1.", default = "smooth")
p <- add_argument(p, "--Sigma", help = "An indicator of whether to smooth age variable, include it as a linear term or only include the intercept in the sigma formula. smooth: ~ pb(age), linear: ~ age, default: ~ 1.", default = "smooth")
p <- add_argument(p, "--Nu", help = "An indicator of whether to smooth age variable, include it as a linear term or only include the intercept in the nu formula. smooth: ~ pb(age), linear: ~ age, default: ~ 1.", default = "default")
p <- add_argument(p, "--Tau", help = "An indicator of whether to smooth age variable, include it as a linear term or only include the intercept in the tau formula. smooth: ~ pb(age), linear: ~ age, default: ~ 1.", default = "default")
p <- add_argument(p, "--covariates", short = '-c', help = "position of covariates (column numbers)", default = "NULL")
p <- add_argument(p, "--model", short = '-m', help = "select the model function for harmonization, eg: lm, gam", default = "lm")
p <- add_argument(p, "--smooth", short = '-s', help = "provide the variables that require a smooth function", default = "NULL")
p <- add_argument(p, "--interaction", help = "specify the interaction terms in the format of col_index1*col_index2, eg 2*3,11*12", default = "NULL")
p <- add_argument(p, "--int_type", help = "specify an interaction type for gam models, eg: linear, factor-smooth, tensor", default = "linear")
p <- add_argument(p, "--random", short = '-r', help = "specify the random intercept-effects", default = "NULL")
p <- add_argument(p, "--rm", help = "specify the covariates (column numbers) to remove effects from, eg: 1-5,9", default = "NULL")
p <- add_argument(p, "--exist.model", help = "A boolean variable indicating whether an existing model is to be used", default = FALSE)
p <- add_argument(p, "--model.path", help = "path to the existing model", default = "NULL")
p <- add_argument(p, "--outdir", short = '-o', help = "full path (including the file name) where residual data should be written")
p <- add_argument(p, "--mout", help = "full path where regression models to be saved")
p <- add_argument(p, "--cores", help = "number of cores used for paralleling computing, please provide a numeric value", default = "all")
argv <- parse_args(p)
# Preprocess inputs
message('Checking inputs...')
if(is.na(argv$data)) stop("Missing input data") else {
if(!grepl("csv$|xls$", argv$data)) stop("Input file must be a csv or an excel file") else {
if(grepl("csv$", argv$data)) df = read.csv(argv$data) else df = read_excel(argv$data)
}
}
df = data.frame(df)
if(is.na(argv$features)) stop("Please identify the position of features/rois.") else {
col = gsub("-",":",argv$features)
col_vec = eval(parse(text = paste0("c(", col, ")")))
}
features = colnames(df[col_vec])
if (argv$cores == "all"){
cores = detectCores()
}else{
cores = as.numeric(argv$cores)
}
if(argv$type == "age_trend"){
message("Start....")
age = colnames(df)[as.numeric(argv$AGE)]
sex = colnames(df)[as.numeric(argv$SEX)]
icv = colnames(df)[as.numeric(argv$ICV)]
print(paste0(c(age, sex, icv)))
df[[sex]] = as.factor(df[[sex]])
df[[sex]] = sapply(df[[sex]], function(x){
if(x == argv$Female){return("F")}else{return("M")}
}, USE.NAMES = FALSE)
message("Create sub_df(s)....")
# Create sub_df for different features
df_var = paste0("sub_df_", 1:length(features))
for (i in 1:length(features)){
sub_df = df[,c(features[i], age, sex, icv)] %>% na.omit()
colnames(sub_df) = c("y", "age", "sex", "icv")
assign(df_var[i], sub_df)
}
# Create age_list
age_list = mclapply(1:length(features), function(w){
age_sub = age_list_gen (sub_df = eval(parse(text = paste0("sub_df_",w))), lq = as.numeric(argv$lowerquantile), hq = as.numeric(argv$upperquantile), mu = argv$Mu, sigma = argv$Sigma, nu = argv$Nu, tau = argv$Tau)
return(age_sub)
}, mc.cores = cores)
names(age_list) = features
quantile_type = c(paste0("quantile_", 100*as.numeric(argv$lowerquantile)), "median", paste0("quantile_", 100*as.numeric(argv$upperquantile)))
ComBatFamQC::age_shiny(age_list, features, quantile_type)
}else if(argv$type == "residual"){
if(argv$covariates == "NULL") {
cov_col = NULL
covariates = NULL
} else {
cov_col = gsub("-",":",argv$covariates)
cov_col = eval(parse(text = paste0("c(", cov_col, ")")))
covariates = colnames(df)[cov_col]
}
if(argv$model == "gam"){
if(argv$covariates == "NULL") stop("Please provide covariates for gam model")
if(argv$smooth == "NULL") stop("Please provide variables that require a smoothing function") else {
smooth_col = gsub("-",":",argv$smooth)
smooth_col = eval(parse(text = paste0("c(", smooth_col, ")")))
smooth_var = colnames(df)[smooth_col]
smooth = smooth_var
}
}else{
smooth = eval(parse(text = argv$smooth))
}
if(argv$model == "lmer"){
if(argv$random == "NULL") stop("Please specify random intercept-effects") else {
random_col = gsub("-",":",argv$random)
random_col = eval(parse(text = paste0("c(", random_col, ")")))
random_var = colnames(df)[random_col]
random = random_var
}
}else{
random_col = eval(parse(text = argv$random))
random = eval(parse(text = argv$random))
}
## Interaction Wranggling
if(argv$interaction == "NULL"){
interaction = eval(parse(text = argv$interaction))
smooth_int_type = NULL
}else{
interaction_l = lapply(str_split(argv$interaction, ",")[[1]], function(x) str_split(x, "\\*")[[1]])
interaction = sapply(interaction_l, function(x){
x1 = colnames(df)[as.numeric(x[1])]
x2 = colnames(df)[as.numeric(x[2])]
element = paste0(x1, ",", x2)
}, USE.NAMES = FALSE)
smooth_int_type = str_split(argv$int_type, ",")[[1]]
}
if(argv$rm == "NULL"){
rm = NULL
}else{
rm_col = gsub("-",":",argv$rm)
rm_col = eval(parse(text = paste0("c(", rm_col, ")")))
rm = colnames(df)[rm_col]
}
# Generate residuals
result = residual_gen(type = argv$model, features = features, covariates = covariates, interaction = interaction, smooth = smooth, smooth_int_type = smooth_int_type, random = random, df = df, rm = rm, model = argv$exist.model, model_path = argv$model.path, cores = cores)
if(!is.na(argv$outdir)){
message("Saving residual data......")
write_csv(result$residual, argv$outdir)
message(sprintf("Results saved at %s", argv$outdir))
}
if(!is.na(argv$mout)){
message("Saving model......")
saveRDS(result$model, argv$mout)
message(sprintf("Model saved at %s", argv$mout))
}
}