diff --git a/.github/workflows/onlabel_rebuild_vignettes.yml b/.github/workflows/onlabel_rebuild_vignettes.yml
new file mode 100644
index 00000000..346e3c86
--- /dev/null
+++ b/.github/workflows/onlabel_rebuild_vignettes.yml
@@ -0,0 +1,197 @@
+# Workflow triggered when a pull request is labeled to request a vignette rebuild
+# This action is adapted from https://github.com/t4d-gmbh/stubbed_versioning
+name: On Label rebuild vignettes
+
+on:
+  pull_request:
+    types: [ labeled ]
+
+env:
+  LABEL_CHECK: 'CompVignettes::build'
+  LABEL_SUCCESS: 'CompVignettes::passed'
+  LABEL_FAILURE: 'CompVignettes::failed'
+  DOC_LOC: "./docs"
+  BRANCH: ${{ github.head_ref || github.ref_name }}
+
+jobs:
+  set_target:
+    if: ${{ github.event.label.name == 'CompVignettes::build' }}
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.set_label.outputs.label }}
+    steps:
+      - id: set_label
+        run: |
+          echo "label=${{ env.LABEL_CHECK }}" >> "$GITHUB_OUTPUT"
+
+  check_label_exist:
+    needs:
+      - set_target
+    runs-on: ubuntu-latest
+    env:
+      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      OWNER: ${{ github.repository_owner }}
+      REPO: ${{ github.event.repository.name }}
+    permissions:
+      pull-requests: write
+      contents: write
+    steps:
+      - name: Assert that label ${{ env.LABEL_CHECK }} is defined
+        run: |
+          gh label create ${{ env.LABEL_CHECK }} --repo ${{ env.OWNER }}/${{ env.REPO }}
+        continue-on-error: true  # ensure the next steps also run if the label already exists
+      - name: Assert that label ${{ env.LABEL_SUCCESS }} is defined
+        run: |
+          gh label create ${{ env.LABEL_SUCCESS }} --repo ${{ env.OWNER }}/${{ env.REPO }}
+        continue-on-error: true  # ensure the next steps also run if the label already exists
+      - name: Assert that label ${{ env.LABEL_FAILURE }} is defined
+        run: |
+          gh label create ${{ env.LABEL_FAILURE }} --repo ${{ env.OWNER }}/${{ env.REPO }}
+        continue-on-error: true  # ensure the next steps also run if the label already exists
+
+  vignette_build:
+    runs-on: ubuntu-latest
+    needs:
+      - set_target
+    container:
+      image: ${{ vars.CONTAINER_SOURCE }}/${{ matrix.os }}/${{ matrix.compiler }}/${{ matrix.r-version }}/abn:${{ vars.CONTAINER_VERSION || 'latest' }}
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+    env:
+      R_KEEP_PKG_SOURCE: yes  # NOTE Jo: unclear why this is needed
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      EVENT: ${{ github.event.number }}
+    strategy:
+      fail-fast: false
+      matrix:
+        r-version: ['release']
+        os: ['debian']
+        compiler: ['gcc']
+    permissions:
+      contents: write
+      id-token: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout branch of PR
+        run: |
+          apt-get update && apt-get install -y gh
+          git config --global --add safe.directory /__w/abn/abn
+          gh pr checkout ${{ env.EVENT }}
+      - name: Disable renv
+        run: |
+          renv::deactivate()
+        shell: Rscript {0}
+      - name: Configure ABN for installation
+        run: |
+          autoreconf
+      - name: Install package dependencies
+        run: |
+          devtools::install_deps(pkg = '.', dependencies = TRUE, upgrade = 'never')
+        shell: Rscript {0}
+      - name: Install ABN
+        run: |
+          R CMD INSTALL .
+      - name: Rebuild Vignettes
+        run: |
+          source("vignettes/precompile.R")
+        shell: Rscript {0}
+      - name: Copy Figures to Vignettes folder
+        run: |
+          mv ./*.png vignettes/
+      - uses: actions/upload-artifact@v4
+        with:
+          path: vignettes/
+      - name: Commit compiled vignettes to branch
+        run: |
+          git config --global user.name 'GitHub Vignette Bot'
+          git config --global user.email 'vignettebot@github.com'
+          git config --global --add safe.directory /__w/abn/abn
+          git add vignettes/
+          git add *.png
+          git commit -m "Automated Vignette Compilation"
+          git push
+
+  report_vignette_build:
+    if: ${{ (success() || failure()) }}
+    needs:
+      - vignette_build
+      - check_label_exist
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+      repository-projects: write
+    env:
+      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      OWNER: ${{ github.repository_owner }}
+      REPO: ${{ github.event.repository.name }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Check if on-demand tasks succeeded
+        run: |
+          gh pr edit ${{ env.EVENT }} --remove-label ${{ env.LABEL_CHECK }} --repo ${{ env.OWNER }}/${{ env.REPO }}
+          if [ ${{ needs.vignette_build.result }} == "success" ]; then
+            gh pr edit ${{ env.EVENT }} --remove-label ${{ env.LABEL_FAILURE }} --repo ${{ env.OWNER }}/${{ env.REPO }}
+            gh pr edit ${{ env.EVENT }} --add-label ${{ env.LABEL_SUCCESS }} --repo ${{ env.OWNER }}/${{ env.REPO }}
+            echo "### ${{ github.event.label.url }} passed! :rocket:" >> $GITHUB_STEP_SUMMARY
+            exit 0
+          elif [ ${{ needs.vignette_build.result }} == "failure" ]; then
+            gh pr edit ${{ env.EVENT }} --remove-label ${{ env.LABEL_SUCCESS }} --repo ${{ env.OWNER }}/${{ env.REPO }}
+            gh pr edit ${{ env.EVENT }} --add-label ${{ env.LABEL_FAILURE }} --repo ${{ env.OWNER }}/${{ env.REPO }}
+            echo "### ${{ github.event.label.url }} failed!" >> $GITHUB_STEP_SUMMARY
+            exit 1
+          else
+            gh pr edit ${{ env.EVENT }} --add-label ${{ env.LABEL_CHECK }} --repo ${{ env.OWNER }}/${{ env.REPO }}
+            echo "On-demand task outcome was ${{ needs.vignette_build.result }}"
+          fi
+        shell: bash
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          EVENT: ${{ github.event.number }}  # this is either the issue or PR number
+
+  record_passed_label:
+    runs-on: ubuntu-latest
+    env:
+      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      OWNER: ${{ github.repository_owner }}
+      REPO: ${{ github.event.repository.name }}
+    permissions:
+      contents: write
+      pull-requests: write
+      repository-projects: write
+    outputs:
+      passed: ${{ steps.passed.outputs.PASSED }}
+    steps:
+      - name: Check if the pull request is labeled with ${{ env.LABEL_SUCCESS }}
+        id: passed
+        run: |
+          if gh pr view ${{ env.EVENT }} --repo ${{ env.OWNER }}/${{ env.REPO }} --json "labels" --jq ".labels[].name" | grep --quiet ${{ env.LABEL_SUCCESS }}; then
+            echo "PASSED=true" >> $GITHUB_OUTPUT
+          else
+            echo "PASSED=false" >> $GITHUB_OUTPUT
+          fi
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          EVENT: ${{ github.event.number }}  # this is either the issue or PR number
+
+  vignette_build_passed:
+    if: ${{ always() }}
+    needs:
+      - vignette_build
+      - record_passed_label
+    runs-on: ubuntu-latest
+    env:
+      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      OWNER: ${{ github.repository_owner }}
+      REPO: ${{ github.event.repository.name }}
+    steps:
+      - name: Assert that either the build job passed or the label is present
+        run: |
+          if [[ ${{ needs.vignette_build.result }} == 'success' || ${{ needs.record_passed_label.outputs.passed }} == 'true' ]]; then
+            echo 'vignette_build status ok'
+          else
+            exit 1
+          fi
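Reviewer note: the `Rebuild Vignettes` step sources `vignettes/precompile.R`, which is not included in this diff. By the usual precompilation convention, such a script knits each `*.Rmd.orig` source (like `vignettes/multiprocessing.Rmd.orig` below) into the static `*.Rmd` that ships with the package; knitr writes the figures into the working directory, which is why the next step moves `./*.png` into `vignettes/`. A minimal sketch of what such a script typically looks like — an assumption for orientation, not the repository's actual code:

``` r
# Hypothetical sketch of vignettes/precompile.R (the real script is not part
# of this diff). Knit every .Rmd.orig source into its static .Rmd counterpart,
# running from the package root as the workflow step above does.
library(knitr)

for (orig in list.files("vignettes", pattern = "\\.Rmd\\.orig$", full.names = TRUE)) {
  knit(orig, output = sub("\\.orig$", "", orig))  # foo.Rmd.orig -> foo.Rmd
}
```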
diff --git a/DESCRIPTION b/DESCRIPTION
index 01629c25..7e924332 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -80,4 +80,4 @@ Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.1
-SystemRequirements: Gnu Scientific Library version >= 1.12
+SystemRequirements: GNU Scientific Library version >= 1.12, jpeg, gdal, geos, proj
diff --git a/vignettes/benchmarkBoxPlot-1.png b/vignettes/benchmarkBoxPlot-1.png
index eebd85f9..73348c78 100644
Binary files a/vignettes/benchmarkBoxPlot-1.png and b/vignettes/benchmarkBoxPlot-1.png differ
diff --git a/vignettes/data_simulation.Rmd b/vignettes/data_simulation.Rmd
index 1a0ed2d6..8f6c747e 100644
--- a/vignettes/data_simulation.Rmd
+++ b/vignettes/data_simulation.Rmd
@@ -10,8 +10,15 @@ vignette: >
 
 
-```r
+``` r
 library(abn)
+#> abn version 3.1.1 (2024-05-22) is loaded.
+#> To cite the package 'abn' in publications call: citation('abn').
+#> 
+#> Attaching package: 'abn'
+#> The following object is masked from 'package:base':
+#> 
+#>     factorial
 ```
 
 In this vignette, we will simulate data from an additive Bayesian network and compare it to the original data.
@@ -22,7 +29,7 @@ First, we will fit a model to the original data that we will use to simulate new data.
 
 We will use the `ex1.dag.data` data set and fit a model to it.
 
-```r
+``` r
 # Load example data
 mydat <- ex1.dag.data
 
@@ -43,6 +50,13 @@ mycache <- buildScoreCache(data.df = mydat,
                            data.dists = mydists,
                            method = "bayes",
                            max.parents = 4)
+#> Warning: package 'INLA' was built under R version 4.4.1
+#> Loading required package: Matrix
+#> Loading required package: sp
+#> This is INLA_24.06.27 built 2024-06-27 02:36:04 UTC.
+#> - See www.r-inla.org/contact-us for how to get help.
+#> - List available models/likelihoods/etc with inla.list.models()
+#> - Use inla.doc() to access documentation
 
 # Structure learning
 mp.dag <- mostProbable(score.cache = mycache)
@@ -64,7 +78,7 @@ Based on the `abnFit` object, we can simulate new data.
 
 By default `simulateAbn()` synthesizes 1000 new data points.
 
-```r
+``` r
 mydat_sim <- simulateAbn(object = myfit)
 str(mydat_sim)
 #> 'data.frame': 1000 obs. of  10 variables:
@@ -86,7 +100,7 @@ Especially for debugging purposes, it can be useful to manually inspect the BUGS
 This can be done by skipping the simulation run with `run.simulation = FALSE` and printing the BUGS file to the console with `verbose = TRUE`.
 
-```r
+``` r
 # Simulate new data and print the BUGS file to the console
 simulateAbn(object = myfit,
             run.simulation = FALSE,
@@ -100,13 +114,13 @@ To store the BUGS file for reproducibility or manual inspection, we can set the
 We can compare the original and simulated data by plotting the distributions of the variables.
 
-```r
+``` r
 # order the columns of mydat equal to mydat_sim
 mydat <- mydat[, colnames(mydat_sim)]
 ```
 
 
-```r
+``` r
 library(ggplot2)
 library(gridExtra)
 
@@ -152,7 +166,7 @@ for (i in seq_along(variables)) {
 
 
 
-```r
+``` r
 # Print all plots
 do.call(grid.arrange, c(plots, ncol = 1))
 ```
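Reviewer note: the recompiled data_simulation vignette compares the original and simulated data visually. A quick numeric counterpart can help when checking the committed output. This is a sketch using the vignette's own objects (`mydat`, `mydat_sim`); the loop itself is not part of the vignette:

``` r
# Sketch: print side-by-side marginal summaries of the original and the
# simulated data. `mydat` and `mydat_sim` are the objects created in the
# vignette above; summary() handles both numeric and factor columns.
for (v in colnames(mydat_sim)) {
  cat("\n==", v, "==\n")
  print(summary(mydat[[v]]))      # original data
  print(summary(mydat_sim[[v]]))  # simulated data
}
```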
diff --git a/vignettes/fit_model-1.png b/vignettes/fit_model-1.png
index 8e71d2a0..bd44132f 100644
Binary files a/vignettes/fit_model-1.png and b/vignettes/fit_model-1.png differ
diff --git a/vignettes/multiprocessing.Rmd b/vignettes/multiprocessing.Rmd
index ca905c00..ac782456 100644
--- a/vignettes/multiprocessing.Rmd
+++ b/vignettes/multiprocessing.Rmd
@@ -40,7 +40,8 @@ We will use the `microbenchmark` package to measure the time it takes to compute
 
 ## Load the data and specify the parameters
 
-```r
+
+``` r
 library(abn)
 library(microbenchmark)
 
@@ -47,7 +48,7 @@
 set.seed(123456)
 ```
 
-```r
+``` r
 # Prepare data and parameters
 df <- FCV[, -c(13)]
 mydists <- list(FCV = "binomial",
@@ -81,7 +82,7 @@ We compare the following methods:
 
 - `bayesMulticoreFORK`: Bayesian estimation on 2 cores using FORK
 
-```r
+``` r
 # Benchmark
 res <- microbenchmark(mleSinglecore = buildScoreCache(data.df = df,
                                                       data.dists = mydists,
@@ -127,7 +128,7 @@ res <- microbenchmark(mleSinglecore = buildScoreCache(data.df = df,
 ```
 
 
-```r
+``` r
 boxplot(res)
 ```
 
@@ -140,17 +141,10 @@ This is due to the efficient implementation of the score cache computation in th
 It leverages either an internal C/C++ implementation or INLA, an efficient implementation of the Bayesian approach.
 The method selection, by default, is automatic and depends on the specific use case.
 The frequentist approach, on the other hand, relies on other R packages, which introduces a higher overhead.
-The multicore approach is generally faster than the singlecore approach.
-This is particularly noticeable for the frequentist approach, where both multicore methods surpass the singlecore method in speed.
-The Bayesian approach is already highly efficient, so the gain from using multiple cores is not as pronounced.
-
-For the Bayesian approach, the FORK method is generally faster than the PSOCK method.
+The FORK method is generally faster than the PSOCK method, especially for the Bayesian approach.
 This is because the FORK method shares memory objects between the processes, leading to significant efficiencies with large objects.
 In contrast, the PSOCK method creates a set of independent R processes and communicates between them using sockets, which introduces a higher memory overhead.
-For this example, the difference to the single core approach is not significant, likely because the problem is not large enough to greatly benefit from parallelization.
-
-Interestingly, for the frequentist approach, the PSOCK method appears to be generally faster than the FORK method.
-This can occur when the overhead of copying large objects in memory outweighs the benefits of shared memory in the FORK method.
+For this example, PSOCK performs worse than FORK, likely because the problem is too small for the parallel gains to offset PSOCK's extra communication and memory overhead.
 
 In conclusion, while the Bayesian approach is generally faster than the frequentist approach, the speed-up is larger in the frequentist approach.
 However, the choice between FORK and PSOCK depends on the operating system and the specific use case.
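Reviewer note: for anyone wanting to see the FORK/PSOCK difference the revised paragraph describes, here is a small self-contained sketch using base R's `parallel` package directly (cluster size, data, and function are illustrative; abn wires up its clusters internally):

``` r
library(parallel)

# PSOCK: spawns fresh R processes and talks to them over sockets, so large
# objects must be serialized and copied to every worker.
cl_psock <- makeCluster(2, type = "PSOCK")
big <- rnorm(1e6)
clusterExport(cl_psock, "big")  # explicit copy of `big` to each worker
psock_res <- parSapply(cl_psock, 1:2, function(i) mean(big) + i)
stopCluster(cl_psock)

# FORK (unix only): workers inherit the parent's memory via copy-on-write,
# so `big` is visible to them without any export or copying.
if (.Platform$OS.type == "unix") {
  cl_fork <- makeCluster(2, type = "FORK")
  fork_res <- parSapply(cl_fork, 1:2, function(i) mean(big) + i)
  stopCluster(cl_fork)
}
```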
diff --git a/vignettes/multiprocessing.Rmd.orig b/vignettes/multiprocessing.Rmd.orig
index 911d8525..dd1aba27 100644
--- a/vignettes/multiprocessing.Rmd.orig
+++ b/vignettes/multiprocessing.Rmd.orig
@@ -141,17 +141,10 @@ This is due to the efficient implementation of the score cache computation in th
 It leverages either an internal C/C++ implementation or INLA, an efficient implementation of the Bayesian approach.
 The method selection, by default, is automatic and depends on the specific use case.
 The frequentist approach, on the other hand, relies on other R packages, which introduces a higher overhead.
-The multicore approach is generally faster than the singlecore approach.
-This is particularly noticeable for the frequentist approach, where both multicore methods surpass the singlecore method in speed.
-The Bayesian approach is already highly efficient, so the gain from using multiple cores is not as pronounced.
-
-For the Bayesian approach, the FORK method is generally faster than the PSOCK method.
+The FORK method is generally faster than the PSOCK method, especially for the Bayesian approach.
 This is because the FORK method shares memory objects between the processes, leading to significant efficiencies with large objects.
 In contrast, the PSOCK method creates a set of independent R processes and communicates between them using sockets, which introduces a higher memory overhead.
-For this example, the difference to the single core approach is not significant, likely because the problem is not large enough to greatly benefit from parallelization.
-
-Interestingly, for the frequentist approach, the PSOCK method appears to be generally faster than the FORK method.
-This can occur when the overhead of copying large objects in memory outweighs the benefits of shared memory in the FORK method.
+For this example, PSOCK performs worse than FORK, likely because the problem is too small for the parallel gains to offset PSOCK's extra communication and memory overhead.
 
 In conclusion, while the Bayesian approach is generally faster than the frequentist approach, the speed-up is larger in the frequentist approach.
 However, the choice between FORK and PSOCK depends on the operating system and the specific use case.
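Reviewer note: to complement the `marginals` discussion in the parameter_learning vignette below, here is a sketch of how one such posterior density could be plotted on its own. The indexing assumes each entry of `myres.c2$marginals` is a two-column matrix of evaluation points and density values — an assumption based on the vignette text, since this diff does not show the object's layout:

``` r
library(ggplot2)

# Sketch: plot a single marginal posterior density from the fitAbn() result.
# The node/parameter names below are illustrative; adjust the indexing to the
# actual structure of myres.c2$marginals.
marg <- as.data.frame(myres.c2$marginals[["b1"]][["b1|(Intercept)"]])
names(marg) <- c("x", "y")

ggplot(marg, aes(x = x, y = y)) +
  geom_line() +
  labs(x = "Parameter value", y = "Density",
       title = "Marginal posterior of the b1 intercept")
```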
diff --git a/vignettes/parameter_learning.Rmd b/vignettes/parameter_learning.Rmd
index 483c5ca5..69dbb867 100644
--- a/vignettes/parameter_learning.Rmd
+++ b/vignettes/parameter_learning.Rmd
@@ -10,7 +10,7 @@ vignette: >
 
 
-```r
+``` r
 library(abn)
 ```
 
@@ -33,7 +33,7 @@ Alternatively, with `fitAbn()` we can fit the parameters to a manually specified
 In this case, we need to provide the data, its distributions and the DAG as arguments to `fitAbn()`:
 
-```r
+``` r
 # Load an illustrative subset of the example data
 mydat <- ex0.dag.data[,c("b1","b2","b3","g1","b4","p2","p4")]
 
@@ -61,7 +61,7 @@ colnames(mydag) <- rownames(mydag) <- names(mydat)
 Fit the model to calculate the log marginal likelihood goodness of fit:
 
-```r
+``` r
 myres.c <- fitAbn(dag = mydag,
                   data.df = mydat,
                   data.dists = mydists,
@@ -76,7 +76,7 @@ plot(myres.c)
 
 We can examine the parameter estimates with the `print()` method for the class `abnFit`:
 
-```r
+``` r
 print(myres.c)
 #> The ABN model was fitted using a Bayesian approach. The estimated modes (the highest posterior density values of the parameters) are:
 #> 
 #> 
@@ -121,7 +121,7 @@ This is useful to assess the uncertainty in the parameter estimates and to check
 Now fit the model with `compute.fixed=TRUE` to calculate the marginal posterior distributions for all parameters:
 
-```r
+``` r
 myres.c2 <- fitAbn(dag = mydag,
                    data.df = mydat,
                    data.dists = mydists,
@@ -133,7 +133,7 @@ The `marginals` slot of the `abnFit` object returned by `fitAbn()` contains a li
 We can plot some of the marginal posterior densities to assess the uncertainty in the parameter estimates:
 
-```r
+``` r
 library(ggplot2)
 library(gridExtra)
diff --git a/vignettes/unnamed-chunk-3-1.png b/vignettes/unnamed-chunk-3-1.png
index b10ed12a..6feed4b4 100644
Binary files a/vignettes/unnamed-chunk-3-1.png and b/vignettes/unnamed-chunk-3-1.png differ
diff --git a/vignettes/unnamed-chunk-6-1.png b/vignettes/unnamed-chunk-6-1.png
index 5badcd79..815e5e73 100644
Binary files a/vignettes/unnamed-chunk-6-1.png and b/vignettes/unnamed-chunk-6-1.png differ