From 23332edd876415ca9bbff6402e5f99c7cc661aea Mon Sep 17 00:00:00 2001 From: Matteo Delucchi Date: Sun, 30 Jun 2024 09:40:15 +0200 Subject: [PATCH] first implementation of proposed improved cache structure in #58. WIP, still errors. --- R/build_score_cache_mle.R | 597 +++++++++++++++++++------------------- 1 file changed, 292 insertions(+), 305 deletions(-) diff --git a/R/build_score_cache_mle.R b/R/build_score_cache_mle.R index 930116cd..0942dbac 100755 --- a/R/build_score_cache_mle.R +++ b/R/build_score_cache_mle.R @@ -27,36 +27,36 @@ forLoopContent <- if (verbose) { message("we have grouping (mixed-effects model) glmm") } - ### - # Build arguments for glmm.bic - ### - # current node's name - child <- mycache[["children"]][row.num] # child as integer - child.name <- colnames(mycache[["node.defn"]])[child] - # current node's parents names - parents.names <- names(which(mycache[["node.defn"]][row.num,] != 0)) - # current node's distribution - child.dist <- data.dists[child] # distribution <- data.dists[child] - # data.df including group.var=group.ids - data.df.grouping <- data.frame(data.df, grp = factor(group.ids)) - names(data.df.grouping) <- c(names(data.df), group.var) - - ### - # Beginn of GLMM - ### - # Build formula statement - if (length(parents.names) == 0){ - # no parent only random effect - model <- as.formula(paste(child.name, "~ 1 + (1|", group.var, ")")) - } else if (length(parents.names) > 0){ - model <- as.formula(paste(child.name, "~ (1|", group.var, ")+", paste(parents.names, collapse = "+"))) - } else { - stop("Cannot build model formula. Unknown predictor names of parent nodes in 'parents.name'.") - } + ### + # Build arguments for glmm.bic + ### + # current node's name + child <- mycache[["children"]][row.num] # child as integer + child.name <- colnames(mycache[["node.defn"]])[child] + # current node's parents names + parents.names <- names(which(mycache[["node.defn"]][row.num,] != 0)) + # current node's distribution + child.dist <- data.dists[child] # distribution <- data.dists[child] + # data.df including group.var=group.ids + data.df.grouping <- data.frame(data.df, grp = factor(group.ids)) + names(data.df.grouping) <- c(names(data.df), group.var) + + ### + # Beginn of GLMM + ### + # Build formula statement + if (length(parents.names) == 0){ + # no parent only random effect + model <- as.formula(paste(child.name, "~ 1 + (1|", group.var, ")")) + } else if (length(parents.names) > 0){ + model <- as.formula(paste(child.name, "~ (1|", group.var, ")+", paste(parents.names, collapse = "+"))) + } else { + stop("Cannot build model formula. 
Unknown predictor names of parent nodes in 'parents.name'.") + } - # Main part: Depending on child's distribution, call the appropriate modelling function - switch (as.character(child.dist), - gaussian = { + # Main part: Depending on child's distribution, call the appropriate modelling function + switch (as.character(child.dist), + gaussian = { tryCatch({ fit <- lme4::lmer(model, data = data.df.grouping) }, error=function(e)NULL) @@ -75,8 +75,8 @@ forLoopContent <- } # if fit is still NULL, try fixed effect only - }, - binomial = { + }, + binomial = { tryCatch({ fit <- lme4::glmer(model, data = data.df.grouping, family = "binomial") }, error=function(e)NULL) @@ -96,8 +96,8 @@ forLoopContent <- } # if fit is still NULL, do not modify model further as this would change the structure of the dag but return very low score (further down) - }, - poisson = { + }, + poisson = { tryCatch({ fit <- lme4::glmer(model, data = data.df.grouping, family = "poisson") }, error=function(e)NULL) @@ -117,54 +117,54 @@ forLoopContent <- } # if fit is still NULL, do not modify model further as this would change the structure of the dag but return very low score (further down) - }, - multinomial = { - if (length(parents.names) == 0){ - model_basic <- as.formula(paste(child.name, "~ 1")) - model_random <- as.formula(paste("~ 1|", group.var, sep = "")) - } else { - model_basic <- as.formula(paste(child.name, "~ ", paste(parents.names, collapse = "+"))) - model_random <- as.formula(paste("~ 1|", group.var, sep = "")) - } - if(verbose) message(paste("using mblogit with fixed term:", deparse1(model_basic), "and random term:", deparse1(model_random))) else NA - - if (control[["catcov.mblogit"]] == "free"){ - tryCatch({ - fit <- mclogit::mblogit(formula = model_basic, random = model_random, data = data.df.grouping, catCov = "free") - }, error=function(e) NULL) - } else if (control[["catcov.mblogit"]] == "diagonal"){ - tryCatch({ - fit <- mclogit::mblogit(formula = model_basic, random = model_random, data = data.df.grouping, catCov = "diagonal") + }, + multinomial = { + if (length(parents.names) == 0){ + model_basic <- as.formula(paste(child.name, "~ 1")) + model_random <- as.formula(paste("~ 1|", group.var, sep = "")) + } else { + model_basic <- as.formula(paste(child.name, "~ ", paste(parents.names, collapse = "+"))) + model_random <- as.formula(paste("~ 1|", group.var, sep = "")) + } + if(verbose) message(paste("using mblogit with fixed term:", deparse1(model_basic), "and random term:", deparse1(model_random))) else NA + + if (control[["catcov.mblogit"]] == "free"){ + tryCatch({ + fit <- mclogit::mblogit(formula = model_basic, random = model_random, data = data.df.grouping, catCov = "free") + }, error=function(e) NULL) + } else if (control[["catcov.mblogit"]] == "diagonal"){ + tryCatch({ + fit <- mclogit::mblogit(formula = model_basic, random = model_random, data = data.df.grouping, catCov = "diagonal") + # manipulate VarCov to bring in correct shape + fit_vcov <- matrix(0, nrow = length(fit$VarCov), ncol = length(fit$VarCov)) + diag(fit_vcov) <- unlist(fit$VarCov, use.names = FALSE) + # col and rownames + cn <- c() + rn <- c() + for(i in 1:length(fit$VarCov)){ + cn[i] <- colnames(fit$VarCov[[i]]) + rn[i] <- rownames(fit$VarCov[[i]]) + } + colnames(fit_vcov) <- cn + rownames(fit_vcov) <- rn + # replace + fit_vcov <- list(fit_vcov) + names(fit_vcov) <- group.var + fit$VarCov <- fit_vcov + }, error=function(e) NULL) + } else if (control[["catcov.mblogit"]] == "single"){ + stop("'catcov.mblogit' == 'single' is not yet 
implemented.") + fit <- mclogit::mblogit(formula = model_basic, random = model_random, data = data.df.grouping, catCov = "single") # manipulate VarCov to bring in correct shape - fit_vcov <- matrix(0, nrow = length(fit$VarCov), ncol = length(fit$VarCov)) - diag(fit_vcov) <- unlist(fit$VarCov, use.names = FALSE) - # col and rownames - cn <- c() - rn <- c() - for(i in 1:length(fit$VarCov)){ - cn[i] <- colnames(fit$VarCov[[i]]) - rn[i] <- rownames(fit$VarCov[[i]]) - } - colnames(fit_vcov) <- cn - rownames(fit_vcov) <- rn - # replace - fit_vcov <- list(fit_vcov) - names(fit_vcov) <- group.var - fit$VarCov <- fit_vcov - }, error=function(e) NULL) - } else if (control[["catcov.mblogit"]] == "single"){ - stop("'catcov.mblogit' == 'single' is not yet implemented.") - fit <- mclogit::mblogit(formula = model_basic, random = model_random, data = data.df.grouping, catCov = "single") - # manipulate VarCov to bring in correct shape - } else { - stop("invalid 'catcov.mblogit' argument. Must be one of 'free', 'diagonal' or 'single'.") - } + } else { + stop("invalid 'catcov.mblogit' argument. Must be one of 'free', 'diagonal' or 'single'.") + } if (is.null(fit)){ # relax tolerances for change in parameter values and objective function. # tryCatch({fit <- mclogit::mblogit(formula = model_basic, random = model_random, data = data.df.grouping, catCov = control[["catcov.mblogit"]] - # control = mclogit::mclogit.control(epsilon = control[["epsilon"]], - # trace = control[["trace.mblogit"]])) + # control = mclogit::mclogit.control(epsilon = control[["epsilon"]], + # trace = control[["trace.mblogit"]])) # }, error=function(e)NULL) if (control[["catcov.mblogit"]] == "free"){ tryCatch({ @@ -205,109 +205,109 @@ forLoopContent <- } } # if fit is still NULL, do not modify model further as this would change the structure of the dag but return very low score (further down) - } - ) - ### - # End of GLMM - ### - # collect values to return - if(!is.null(fit)){ - fit_loglik <- logLik(fit) - fit_aic <- AIC(fit) - fit_bic <- BIC(fit) - # fit_mdl <- fit_bic + (1 + sum(mycache[["node.defn.multi"]][row.num, ]) - num.na) * log(ncol(data.df.grouping)) # num.na: number of "unused" variables determinded in removing columns if rank deficient - fit_mdl <- fit_bic + (attributes(logLik(fit))$df-1) * log(ncol(data.df.grouping)) # logLik returns object of degrees of freedom which corresponds to the models parameters +1 - - c(fit_loglik, fit_aic, fit_bic, fit_mdl) - } else if(is.null(fit)){ - # no convergence, singularity, rank-deficiency, return very low score - c(rep(-Inf, 4)) - } else { - stop("Unknown state of fit. I should never end up here.") - } - } else if (is.null(group.var)) { - # we have no grouping (do what was always done). 
- child <- mycache[["children"]][row.num] - distribution <- data.dists[child] - Y <- data.matrix(data.df[, child]) - - if (is.null(adj.vars)) { - if ("multinomial" %in% data.dists[as.logical(mycache$node.defn[row.num, ])]) { - X <- data.matrix(cbind(data.df.multi[, as.logical(mycache[["node.defn.multi"]][row.num, ])])) + } + ) + ### + # End of GLMM + ### + # collect values to return + if(!is.null(fit)){ + fit_loglik <- logLik(fit) + fit_aic <- AIC(fit) + fit_bic <- BIC(fit) + # fit_mdl <- fit_bic + (1 + sum(mycache[["node.defn.multi"]][row.num, ]) - num.na) * log(ncol(data.df.grouping)) # num.na: number of "unused" variables determinded in removing columns if rank deficient + fit_mdl <- fit_bic + (attributes(logLik(fit))$df-1) * log(ncol(data.df.grouping)) # logLik returns object of degrees of freedom which corresponds to the models parameters +1 + + c(fit_loglik, fit_aic, fit_bic, fit_mdl) + } else if(is.null(fit)){ + # no convergence, singularity, rank-deficiency, return very low score + c(rep(-Inf, 4)) } else { - X <- data.matrix(cbind(rep(1, length(data.df[, 1])), data.df.multi[, as.logical(mycache[["node.defn.multi"]][row.num, ])])) + stop("Unknown state of fit. I should never end up here.") } - } else { - if ("multinomial" %in% data.dists[as.logical(mycache$node.defn.adj[row.num, ])]) { - X <- data.matrix(cbind(data.df.multi[, as.logical(mycache[["node.defn.multi"]][row.num, ])])) + } else if (is.null(group.var)) { + # we have no grouping (do what was always done). + child <- mycache[["children"]][row.num] + distribution <- data.dists[child] + Y <- data.matrix(data.df[, child]) + + if (is.null(adj.vars)) { + if ("multinomial" %in% data.dists[as.logical(mycache$node.defn[row.num, ])]) { + X <- data.matrix(cbind(data.df.multi[, as.logical(mycache[["node.defn.multi"]][row.num, ])])) + } else { + X <- data.matrix(cbind(rep(1, length(data.df[, 1])), data.df.multi[, as.logical(mycache[["node.defn.multi"]][row.num, ])])) + } } else { - X <- data.matrix(cbind(rep(1, length(data.df[, 1])), data.df.multi[, as.logical(mycache[["node.defn.multi"]][row.num, ])])) + if ("multinomial" %in% data.dists[as.logical(mycache$node.defn.adj[row.num, ])]) { + X <- data.matrix(cbind(data.df.multi[, as.logical(mycache[["node.defn.multi"]][row.num, ])])) + } else { + X <- data.matrix(cbind(rep(1, length(data.df[, 1])), data.df.multi[, as.logical(mycache[["node.defn.multi"]][row.num, ])])) + } } - } - ## Rank deficiency - num.na <- 0 + ## Rank deficiency + num.na <- 0 - R <- rank_cpp(X) - r <- ncol(X) - R_col <- R/r + R <- rank_cpp(X) + r <- ncol(X) + R_col <- R/r - if (R_col != 1 & as.character(distribution) == "binomial") { - Y1 <- if (is.factor(Y)) numeric(Y) else Y + if (R_col != 1 & as.character(distribution) == "binomial") { + Y1 <- if (is.factor(Y)) numeric(Y) else Y - while (rank_cpp(X)/ncol(X) != 1) { - X <- X[, -1] - num.na <- num.na + 1 - if (is.null(ncol(X))) - X <- as.matrix(X) - } + while (rank_cpp(X)/ncol(X) != 1) { + X <- X[, -1] + num.na <- num.na + 1 + if (is.null(ncol(X))) + X <- as.matrix(X) + } - tryCatch(fit <- irls_binomial_cpp_fast_br(A = X, b = Y1, maxit = control[["max.iters"]], tol = control[["tol"]])) - # tryCatch(fit <- irls_binomial_cpp_fast_br(A = X, b = Y1, maxit = control[["max.iters"]], tol = control[["tol"]]), - # error = function(e) { - # while (rank_cpp(X)/ncol(X) != 1) { - # X <- X[, -1] - # num.na <- num.na + 1 - # if (is.null(ncol(X))) - # X <- as.matrix(X) - # } - # fit <- irls_binomial_cpp_fast_br(A = X, b = Y1, maxit = control[["max.iters"]], tol = control[["tol"]]) - 
# }, finally = fit) + tryCatch(fit <- irls_binomial_cpp_fast_br(A = X, b = Y1, maxit = control[["max.iters"]], tol = control[["tol"]])) + # tryCatch(fit <- irls_binomial_cpp_fast_br(A = X, b = Y1, maxit = control[["max.iters"]], tol = control[["tol"]]), + # error = function(e) { + # while (rank_cpp(X)/ncol(X) != 1) { + # X <- X[, -1] + # num.na <- num.na + 1 + # if (is.null(ncol(X))) + # X <- as.matrix(X) + # } + # fit <- irls_binomial_cpp_fast_br(A = X, b = Y1, maxit = control[["max.iters"]], tol = control[["tol"]]) + # }, finally = fit) + } else { + switch(as.character(distribution), + binomial = { + Y1 <- if (is.factor(Y)) numeric(Y) else Y + fit <- irls_binomial_cpp_fast_br(A = X, b = Y1, maxit = control[["max.iters"]], tol = control[["tol"]]) + if (is.na(sum(fit[[1]]))) fit <- irls_binomial_cpp_fast_br(A = X, b = Y, maxit = control[["max.iters"]], tol = control[["tol"]]) + }, gaussian = { + suppressWarnings(fit <- irls_gaussian_cpp_fast(A = X, b = Y, maxit = control[["max.iters"]], tol = control[["tol"]])) + }, poisson = { + suppressWarnings(fit <- irls_poisson_cpp_fast(A = X, b = Y, maxit = control[["max.iters"]], tol = control[["tol"]])) + }, multinomial = { + + Ymulti <- data.matrix(model.matrix(~-1 + data.df.lvl[, child])) + + p <- ncol(Ymulti) + mask <- c(rep(FALSE, r + 1L), rep(c(FALSE, rep(TRUE, r)), p - 1L)) + + tmp <- nnet.default(x = X, y = Ymulti, mask = mask, size = 0, + skip = TRUE, softmax = TRUE, rang = 0, trace = FALSE) + + fit <- NULL + fit$loglik <- -tmp$value + edf <- ifelse(length(tmp$lev) == 2L, 1, length(tmp$lev) - 1) * R + fit$aic <- 2 * tmp$value + 2 * edf + fit$bic <- 2 * tmp$value + edf * log(dim(data.df)[1]) + }) + } + c(fit$loglik, + fit$aic, + fit$bic, + fit$bic + (1 + sum(mycache[["node.defn.multi"]][row.num, ]) - num.na) * log(n)) } else { - switch(as.character(distribution), - binomial = { - Y1 <- if (is.factor(Y)) numeric(Y) else Y - fit <- irls_binomial_cpp_fast_br(A = X, b = Y1, maxit = control[["max.iters"]], tol = control[["tol"]]) - if (is.na(sum(fit[[1]]))) fit <- irls_binomial_cpp_fast_br(A = X, b = Y, maxit = control[["max.iters"]], tol = control[["tol"]]) - }, gaussian = { - suppressWarnings(fit <- irls_gaussian_cpp_fast(A = X, b = Y, maxit = control[["max.iters"]], tol = control[["tol"]])) - }, poisson = { - suppressWarnings(fit <- irls_poisson_cpp_fast(A = X, b = Y, maxit = control[["max.iters"]], tol = control[["tol"]])) - }, multinomial = { - - Ymulti <- data.matrix(model.matrix(~-1 + data.df.lvl[, child])) - - p <- ncol(Ymulti) - mask <- c(rep(FALSE, r + 1L), rep(c(FALSE, rep(TRUE, r)), p - 1L)) - - tmp <- nnet.default(x = X, y = Ymulti, mask = mask, size = 0, - skip = TRUE, softmax = TRUE, rang = 0, trace = FALSE) - - fit <- NULL - fit$loglik <- -tmp$value - edf <- ifelse(length(tmp$lev) == 2L, 1, length(tmp$lev) - 1) * R - fit$aic <- 2 * tmp$value + 2 * edf - fit$bic <- 2 * tmp$value + edf * log(dim(data.df)[1]) - }) + stop("Invalid `group.var`.") } - c(fit$loglik, - fit$aic, - fit$bic, - fit$bic + (1 + sum(mycache[["node.defn.multi"]][row.num, ]) - num.na) * log(n)) - } else { - stop("Invalid `group.var`.") - } -} # End of forLoopContent() + } # End of forLoopContent() #' \code{buildScoreCache.mle} and \code{buildScoreCache.bayes} are internal functions called by \code{buildScoreCache}. 
#' @@ -341,8 +341,8 @@ buildScoreCache.mle <- ## which.nodes if (!is.null(which.nodes)) { - data.df <- data.df[, which.nodes] - data.dists <- data.dists[which.nodes] + data.df <- data.df[, which.nodes] + data.dists <- data.dists[which.nodes] } ## number of variable: @@ -354,120 +354,107 @@ buildScoreCache.mle <- ## standardize gaussian variables to zero mean and sd=1 have at least one gaussian variable if (centre && !is.null(data.dists == "gaussian")) { - for (i in names(data.dists)[(data.dists == "gaussian")]) { - data.df[, i] <- (data.df[, i] - mean(data.df[, i]))/sd(data.df[, i]) - } + for (i in names(data.dists)[(data.dists == "gaussian")]) { + data.df[, i] <- (data.df[, i] - mean(data.df[, i]))/sd(data.df[, i]) + } } for (i in 1:nvars) { - if (data.dists[[i]] == "binomial" & !inherits(data.df[, i], "numeric")) { - data.df[, i] <- as.numeric(factor(data.df[, i])) - 1 - } - if (data.dists[[i]] == "multinomial") { - data.df[, i] <- factor(data.df[, i]) - } + if (data.dists[[i]] == "binomial" & !inherits(data.df[, i], "numeric")) { + data.df[, i] <- as.numeric(factor(data.df[, i])) - 1 + } + if (data.dists[[i]] == "multinomial") { + data.df[, i] <- factor(data.df[, i]) + } } # adjustment: storing of df if (!is.null(adj.vars)) { - data.df.adj <- data.df - data.df <- data.df[, -adj.vars] - nvars <- nvars - length(adj.vars) + data.df.adj <- data.df + data.df <- data.df[, -adj.vars] + nvars <- nvars - length(adj.vars) } ############################## Function to create the cache + # Create node specific max. parent vector + if (length(max.parents) == 1) { + max.parents <- rep(max.parents, nvars) + } - + # Start of the cache creation if (!is.null(defn.res)) { - max.parents <- max(apply(defn.res[["node.defn"]], 1, sum)) - + # If the cache is already computed then we just need to update the max.parents + max.parents <- max(apply(defn.res[["node.defn"]], 1, sum)) } else { - ## Computing the cache - fun.return <- function(x, n) { - # x: vector or single integer - # n: number of variables/columns in data.df - # returns: vector of size n-1 with 1 for all x and zeros otherwise - v <- rep(0, n - 1) - v[x] <- 1 - return(v) - } - - node.defn <- matrix(data = as.integer(0), nrow = 1L, ncol = nvars) - children <- 1 - - for (j in 1:nvars) { - if (j != 1) { - node.defn <- rbind(node.defn, matrix(data = as.integer(0), - nrow = 1L, ncol = nvars)) - children <- cbind(children, j) - } - # node.defn <- rbind(node.defn,matrix(data = 0,nrow = 1,ncol = n)) - - if(is.list(max.parents)){ - stop("ISSUE: `max.parents` as list is not yet implemented further down here. Try with a single numeric value as max.parents instead.") - if(!is.null(which.nodes)){ - stop("ISSUE: `max.parents` as list in combination with `which.nodes` is not yet implemented further down here. Try with single numeric as max.parents instead.") - } - } else if (is.numeric(max.parents) && length(max.parents)>1){ - if (length(unique(max.parents)) == 1){ - max.parents <- unique(max.parents) - } else { - stop("ISSUE: `max.parents` with node specific values that are not all the same, is not yet implemented further down here.") - } - } + ## Computing the cache + fun.return <- function(x, n) { + # x: vector or single integer + # n: number of variables/columns in data.df + # returns: vector of size n-1 with 1 for all x and zeros otherwise + v <- rep(0, n - 1) + v[x] <- 1 + return(v) + } - if(max.parents == nvars){ - max.parents <- max.parents-1 - warning(paste("`max.par` == no. of variables. I set it to (no. 
of variables - 1)=", max.parents)) #NOTE: This might cause differences to method="bayes"! - } + # Generate all possible bit patterns for n variables, with a maximum of m 1s + generateBitPatterns <- function(n, m){ + z <- rep(0, n) + do.call(rbind, lapply(0:m, function(i) t(apply(combn(1:n, i), 2, function(k){ + z[k] <- 1 + return(z) + }))) + ) + } - for (i in 1:(max.parents)) { - tmp <- t(combn(x = (nvars - 1), m = i, FUN = fun.return, n = nvars, simplify = TRUE)) - tmp <- t(apply(X = tmp, MARGIN = 1, FUN = function(x) append(x = x, values = 0, after = j - 1))) + # Function to generate all possible combinations of parents + filteredCombinations <- function(x, m, bannedParents, retainedParents){ + # These are the parents that cannot change + fixedParents <- bannedParents | retainedParents | (fun.return(x, length(x) + 2) + 1) %% 2 + + # These are the parents that can change + parentPossibleChoices <- which(fixedParents == 0) + numPossibleChoices <- length(parentPossibleChoices) + numRetainedParents <- sum(retainedParents) + + # Generate all possible combinations of parents, taking account of banned, retained and maximum number of parents + parentChoices <- generateBitPatterns(numPossibleChoices, min(m-numRetainedParents, numPossibleChoices)) == 1 + output <- t(apply(parentChoices, 1, function(pc){ + combinedRow <- 1L*(retainedParents | fun.return(parentPossibleChoices[pc], length(x) + 2)) + combinedRow + })) + output + } - node.defn <- rbind(node.defn, tmp) + children <- matrix(nrow = 1, ncol = 0) + node.defn.list <- list() - # children position - children <- cbind(children, t(rep(j, length(tmp[, 1])))) - } + for (j in 1:nvars){ + if(is.list(max.parents)){ + stop("ISSUE: `max.parents` as list is not yet implemented further down here. Try with a single numeric value as max.parents instead.") + if(!is.null(which.nodes)){ + stop("ISSUE: `max.parents` as list in combination with `which.nodes` is not yet implemented further down here. Try with single numeric as max.parents instead.") + } } - # children <- rowSums(node.defn) + # The parents that are banned and retained for node j + bannedParents <- dag.banned[j, ] + retainedParents <- dag.retained[j, ] + # All possible parents for node j, which is all nodes except j + parentChoice <- c(seq.int(from = 1, length.out = j-1), seq.int(from = j+1, length.out = nvars-1)) + # How many parents we are keeping for node j + numretainedParents <- sum(retainedParents) + # The maximum number of parents for node j + m <- max.parents[j] + + # Generate all possible combinations of parents for node j + tmp <- filteredCombinations(x = parentChoice, m = m, bannedParents = bannedParents, retainedParents = retainedParents) + # We need a sparse matrix here to deal with large numbers of variables, otherwise memory usage can become very high + tmp2 <- Matrix::Matrix(tmp, sparse = TRUE) + node.defn.list[[length(node.defn.list) + 1]] <- tmp2 + children <- cbind(children, t(rep(j, length(tmp2[, 1])))) + + node.defn <- do.call(rbind, node.defn.list) colnames(node.defn) <- colnames(data.df) - ## Coerce numeric matrix into integer matrix !!! 
- node.defn <- apply(node.defn, c(1, 2), function(x) { - (as.integer(x)) - }) - - children <- as.integer(children) - # node.defn_ <- node.defn - - ## DAG RETAIN/BANNED - for (i in 1:nvars) { - for (j in 1:nvars) { - - ## DAG RETAIN - if (dag.retained[i, j] != 0) { - tmp.indices <- which(children == i & node.defn[, j] == 0) - - if (length(tmp.indices) != 0) { - node.defn <- node.defn[-tmp.indices, ] - children <- children[-tmp.indices] - } - } - - ## DAG BANNED - if (dag.banned[i, j] != 0) { - tmp.indices <- which(children == i & node.defn[, j] == 1) - - if (length(tmp.indices) != 0) { - node.defn <- node.defn[-tmp.indices, ] - children <- children[-tmp.indices] - } - } - - } - } mycache <- list(children = as.integer(children), node.defn = (node.defn)) @@ -477,14 +464,14 @@ buildScoreCache.mle <- ###FIXME if (is.list(max.parents)) { - for (z in 1:nvars) { - tmp <- mycache[["node.defn"]][mycache[["children"]] == z, ] - if (is.null(dim(tmp))) stop("Increase parents for node ",z," (due to retain)") - - if (any(diff(unlist(max.parents)) !=0)) - stop("For method='mle', unique number of parents required") - mycache[["node.defn"]][mycache[["children"]] == z, ] <- tmp[rowSums(tmp) <= unlist(max.parents[z]), ] - } + for (z in 1:nvars) { + tmp <- mycache[["node.defn"]][mycache[["children"]] == z, ] + if (is.null(dim(tmp))) stop("Increase parents for node ",z," (due to retain)") + + if (any(diff(unlist(max.parents)) !=0)) + stop("For method='mle', unique number of parents required") + mycache[["node.defn"]][mycache[["children"]] == z, ] <- tmp[rowSums(tmp) <= unlist(max.parents[z]), ] + } } @@ -494,24 +481,24 @@ buildScoreCache.mle <- if (!is.null(adj.vars)) { - # mycache$node.defn.adj <- mycache$node.defn + # mycache$node.defn.adj <- mycache$node.defn - ## adding adjustment column set to zero mycache$node.defn.adj <- cbind(mycache$node.defn,matrix(data = 0,nrow = dim(mycache$node.defn)[1],ncol = length(adj.vars))) - mycache$node.defn <- cbind(mycache$node.defn, matrix(data = 0, nrow = dim(mycache$node.defn)[1], ncol = length(adj.vars))) + ## adding adjustment column set to zero mycache$node.defn.adj <- cbind(mycache$node.defn,matrix(data = 0,nrow = dim(mycache$node.defn)[1],ncol = length(adj.vars))) + mycache$node.defn <- cbind(mycache$node.defn, matrix(data = 0, nrow = dim(mycache$node.defn)[1], ncol = length(adj.vars))) - if (is.null(cor.vars)) { - cor.vars <- colnames(data.df) - } + if (is.null(cor.vars)) { + cor.vars <- colnames(data.df) + } - ## adjustment variables + ## adjustment variables - mycache$node.defn[mycache$children[match(cor.vars, colnames(data.df))], dim(data.df)[2]:dim(data.df.adj)] <- 1 + mycache$node.defn[mycache$children[match(cor.vars, colnames(data.df))], dim(data.df)[2]:dim(data.df.adj)] <- 1 - ## output - colnames(mycache$node.defn) <- c(colnames(data.df), adj.vars) + ## output + colnames(mycache$node.defn) <- c(colnames(data.df), adj.vars) - mycache$node.defn <- mycache$node.defn[, names(data.df.adj)] - data.df <- data.df.adj + mycache$node.defn <- mycache$node.defn[, names(data.df.adj)] + data.df <- data.df.adj } ##---------------------- @@ -522,18 +509,18 @@ buildScoreCache.mle <- repetition.multi <- vector(length = nvars) for (i in 1:nvars) { - if (data.dists[[i]] %in% c("binomial", "poisson", "gaussian")) { - repetition.multi[i] <- 1 - } else { - repetition.multi[i] <- nlevels(data.df.lvl[, i]) - } + if (data.dists[[i]] %in% c("binomial", "poisson", "gaussian")) { + repetition.multi[i] <- 1 + } else { + repetition.multi[i] <- nlevels(data.df.lvl[, i]) + } } if 
(!is.null(adj.vars)) { - mycache$node.defn.multi <- mycache$node.defn.adj[, rep(1:nvars, repetition.multi)] - data.df <- data.df.adj[, colnames(mycache$node.defn.adj)] + mycache$node.defn.multi <- mycache$node.defn.adj[, rep(1:nvars, repetition.multi)] + data.df <- data.df.adj[, colnames(mycache$node.defn.adj)] } else { - mycache$node.defn.multi <- mycache$node.defn[, rep(1:nvars, repetition.multi)] + mycache$node.defn.multi <- mycache$node.defn[, rep(1:nvars, repetition.multi)] } @@ -542,21 +529,21 @@ buildScoreCache.mle <- data.df.multi <- NULL for (i in 1:nvars) { - if (data.dists[[i]] %in% c("binomial", "poisson", "gaussian")) { - data.df.multi <- as.data.frame(cbind(data.df.multi, data.df[, i])) - colnames(data.df.multi)[length(colnames(data.df.multi))] <- colnames(data.df)[i] - } else { - tmp <- model.matrix(~-1 + factor(data.df.lvl[, i])) - colnames(tmp) <- paste0(names(data.df.lvl)[i], levels(factor(data.df.lvl[, i]))) - data.df.multi <- as.data.frame(cbind(data.df.multi, tmp)) - } + if (data.dists[[i]] %in% c("binomial", "poisson", "gaussian")) { + data.df.multi <- as.data.frame(cbind(data.df.multi, data.df[, i])) + colnames(data.df.multi)[length(colnames(data.df.multi))] <- colnames(data.df)[i] + } else { + tmp <- model.matrix(~-1 + factor(data.df.lvl[, i])) + colnames(tmp) <- paste0(names(data.df.lvl)[i], levels(factor(data.df.lvl[, i]))) + data.df.multi <- as.data.frame(cbind(data.df.multi, tmp)) + } } - } - if (dry.run) { + } + if (dry.run) { return(mycache) + } } - ## EOF cache creation row.num <- NULL # To avoid check comment: 'no visible binding for global variable out <- list() @@ -607,7 +594,7 @@ buildScoreCache.mle <- cl <- makeCluster(ncores, type = control[["cluster.type"]], rscript_args = "--no-environ" # only available for "FORK" - ) + ) } registerDoParallel(cl) @@ -678,5 +665,5 @@ buildScoreCache.mle <- out[["method"]] <- "mle" return(out) -} + }
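
Notes on the proposed cache construction (appended for review, not part of the patch):

The patch replaces the old combn()-plus-post-filtering cache construction with direct enumeration of admissible parent sets via generateBitPatterns() and filteredCombinations(), honouring banned and retained arcs up front. Below is a minimal standalone sketch of that idea, assuming 0/1 integer vectors for the banned/retained constraints; the helper names generate_bit_patterns() and enumerate_parent_sets() are illustrative only and are not part of the abn API.

    # All 0/1 patterns over n candidate slots with at most m ones.
    generate_bit_patterns <- function(n, m) {
      z <- rep(0L, n)
      do.call(rbind, lapply(0:m, function(i) {
        if (i == 0L) return(matrix(z, nrow = 1))
        t(apply(combn(n, i), 2, function(k) { z[k] <- 1L; z }))
      }))
    }

    # Enumerate parent sets for one child, respecting banned/retained parents.
    enumerate_parent_sets <- function(child, nvars, max.parents,
                                      banned = rep(0L, nvars),
                                      retained = rep(0L, nvars)) {
      fixed <- banned | retained              # slots that may not vary
      fixed[child] <- 1L                      # a node is never its own parent
      free <- which(!fixed)
      pats <- generate_bit_patterns(length(free),
                                    min(max.parents - sum(retained), length(free)))
      # start every row from the retained parents, then fill the free slots
      out <- matrix(rep(as.integer(retained), nrow(pats)),
                    nrow = nrow(pats), byrow = TRUE)
      out[, free] <- pats
      out
    }

    # Example: 4 nodes, child 2, at most 2 parents, node 4 banned as a parent of 2.
    enumerate_parent_sets(child = 2, nvars = 4, max.parents = 2,
                          banned = c(0L, 0L, 0L, 1L))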
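
Per node, the patch then stores the admissible parent sets as a sparse matrix (Matrix::Matrix(..., sparse = TRUE)) and row-binds them into a single node.defn with a parallel children index, to keep memory usage manageable for many variables. A rough sketch of that assembly, reusing the enumerate_parent_sets() helper sketched above (the patch additionally sets column names and takes the banned/retained rows from dag.banned/dag.retained):

    library(Matrix)

    nvars <- 4
    node.defn.list <- list()
    children <- integer(0)
    for (j in 1:nvars) {
      tmp <- enumerate_parent_sets(child = j, nvars = nvars, max.parents = 2)
      node.defn.list[[j]] <- Matrix::Matrix(tmp, sparse = TRUE)   # sparse per-node block
      children <- c(children, rep(j, nrow(tmp)))
    }
    node.defn <- do.call(rbind, node.defn.list)                   # one sparse cache matrix
    mycache <- list(children = as.integer(children), node.defn = node.defn)
    dim(mycache$node.defn)   # rows = candidate parent sets over all children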
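
For grouped data, the scoring loop builds a random-intercept formula per child, fits it with lme4, and returns log-likelihood, AIC, BIC and an MDL-style score, falling back to -Inf when the fit fails. A hedged sketch with made-up data; only the formula construction and score vector mirror the patch, the rest is illustrative:

    library(lme4)

    build_glmm_formula <- function(child.name, parents.names, group.var) {
      if (length(parents.names) == 0) {
        as.formula(paste(child.name, "~ 1 + (1|", group.var, ")"))
      } else {
        as.formula(paste(child.name, "~ (1|", group.var, ") +",
                         paste(parents.names, collapse = " + ")))
      }
    }

    set.seed(1)
    d <- data.frame(y = rnorm(60), x1 = rnorm(60), grp = factor(rep(1:6, each = 10)))
    f <- build_glmm_formula("y", "x1", "grp")     # y ~ (1|grp) + x1
    fit <- tryCatch(lme4::lmer(f, data = d), error = function(e) NULL)

    if (!is.null(fit)) {
      # MDL-style penalty as in the patch: BIC + (df - 1) * log(ncol(data))
      c(loglik = as.numeric(logLik(fit)),
        aic    = AIC(fit),
        bic    = BIC(fit),
        mdl    = BIC(fit) + (attr(logLik(fit), "df") - 1) * log(ncol(d)))
    } else {
      rep(-Inf, 4)   # no convergence / singularity: return a very low score
    }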