Merge pull request #6 from yufongpeng/release-0.2.0

Release 0.2.0
yufongpeng · Mar 16, 2023 · 63c2161 · 63c2161 · yufongpeng · Mar 16, 2023
2 parents b3d4fd6 + 4c0fcca
commit 63c2161
Show file tree

Hide file tree

Showing 7 changed files with 250 additions and 273 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "AnovaFixedEffectModels"
 uuid = "e4965305-65d6-464d-9c03-ae3e5cffadab"
 authors = ["yufongpeng <54415349+yufongpeng@users.noreply.github.com> and contributors"]
-version = "0.1.2"
+version = "0.2.0"
 
 [deps]
 AnovaBase = "946dddda-6a23-4b48-8e70-8e60d9b8d680"
@@ -16,11 +16,11 @@ StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-AnovaBase = "0.6.3"
+AnovaBase = "0.7"
 Distributions = "0.23, 0.24, 0.25"
-FixedEffectModels = "1.3, 1.4, 1.5, 1.6, 1.7"
+FixedEffectModels = "1.3, 1.4, 1.5, 1.6, 1.7, 1.8"
 Reexport = "0.2, 1"
 StatsBase = "0.33"
 StatsModels = "0.6"
 Tables = "1.7"
-julia = "1.6, 1.7"
+julia = "1.6, 1.7, 1.8"
diff --git a/src/AnovaFixedEffectModels.jl b/src/AnovaFixedEffectModels.jl
@@ -6,10 +6,12 @@ using Statistics, StatsBase, LinearAlgebra, Distributions, Reexport, Printf
 import StatsBase: fit!, fit
 import StatsModels: TableRegressionModel, vectorize, width, apply_schema, 
                     ModelFrame, ModelMatrix, columntable, asgn
-import AnovaBase: ftest_nested, formula, anova, nestedmodels, _diff, _diffn, dof, dof_residual, deviance, nobs, coefnames
+
+using AnovaBase: select_super_interaction, extract_contrasts, canonicalgoodnessoffit, subformula, dof_asgn, lrt_nested, ftest_nested, _diff, _diffn
+import AnovaBase: anova, nestedmodels, anovatable, prednames, predictors, formula
 using Tables: columntable
 
-export anova_lfe, lfe, to_trm
+export anova_lfe, lfe
 
 include("anova.jl")
 include("fit.jl")

diff --git a/src/anova.jl b/src/anova.jl
@@ -1,176 +1,135 @@
 # ================================================================================================
 # Main API
-@doc """
-    anova(<models>...; test::Type{<: GoodnessOfFit})
-    anova(test::Type{<: GoodnessOfFit}, <models>...;  <keyword arguments>)
+"""
+    anova(<lfemodels>...; test::Type{<: GoodnessOfFit})
+    anova(test::Type{<: GoodnessOfFit}, <lfemodels>...;  <keyword arguments>)
 
 Analysis of variance.
 
-Return `AnovaResult{M, test, N}`.
+Return `AnovaResult{M, test, N}`. See [`AnovaResult`](@ref) for details.
 
-* `models`: model objects
-    1. `TableRegressionModel{<: FixedEffectModel}` fitted by `AnovaFixedEffectModels.lfe`.
-    If mutiple models are provided, they should be nested and the last one is the most saturated.
-* `test`: test statistics for goodness of fit. The default is based on the model type.
-    1. `TableRegressionModel{<: FixedEffectModel}`: `FTest`.
+# Arguments
+* `lfemodels`: model objects
+    1. `FixedEffectModel` fitted by `AnovaFixedEffectModels.lfe` or `FixedEffectModels.reg`.
+    If mutiple models are provided, they should be nested and the last one is the most complex.
+* `test`: test statistics for goodness of fit. Only `FTest` is available now.
 
-Other keyword arguments:
+# Other keyword arguments
 * When one model is provided:  
-    1. `type` specifies type of anova (1 or 3). Default value is 1.
+    1. `type` specifies type of anova. Default value is 1.
 * When multiple models are provided:  
     1. `check`: allows to check if models are nested. Defalut value is true. Some checkers are not implemented now.
-    2. `isnested`: true when models are checked as nested (manually or automatically). Defalut value is false. 
-
-Algorithm:
-
-For the ith model, devᵢ is defined as the sum of [squared deviance residuals (unit deviance)](https://en.wikipedia.org/wiki/Deviance_(statistics)). 
-It is equivalent to the residual sum.
-
-The attribute `deviance` is Δdevᵢ = devᵢ₋₁ - devᵢ.
-
-F-statistic is then defined as Δdevᵢ/(squared dispersion × degree of freedom).
 
-For type I and III ANOVA, F-statistic is computed directly by the variance-covariance matrix(vcov) of the saturated model; the deviance is calculated backward.
-1. Type I:
-
-    First, calculate f as the upper factor of Cholesky factorization of vcov⁻¹ * β.
-
-    For a factor that starts at ith row/column of vcov with n degree of freedom, the f-statistic is Σᵢⁱ⁺ⁿ⁻¹ fₖ²/n.
-2. Type III: 
-
-    For a factor occupying ith to jth row/column of vcov with n degree of freedom, f-statistic is (β[i:j]' * vcov[i:j, i:j]⁻¹ * β[i:j])/n.
-
-For fitting new models and conducting anova at the same time, see [`anova_lfe`](@ref) for `FixedEffectModel`.
+!!! note
+    For fitting new models and conducting anova at the same time, see [`anova_lfe`](@ref) for `FixedEffectModel`.
 """
-anova(::Val{:AnovaFixedEffectModels})
+anova(::Type{<: GoodnessOfFit}, ::Vararg{FixedEffectModel})
 
-anova(trms::Vararg{TableRegressionModel{<: FixedEffectModel}}; 
+anova(models::Vararg{M}; 
         test::Type{<: GoodnessOfFit} = FTest,
-        kwargs...) = 
-    anova(test, trms...; kwargs...)
+        kwargs...) where {M <: FixedEffectModel} = 
+    anova(test, models...; kwargs...)
 
 # ================================================================================================
 # ANOVA by F test
+anova(::Type{FTest}, 
+    model::M; 
+    type::Int = 1) where {M <: FixedEffectModel} = anova(FTest, FullModel(model, type, true, true))
 
 function anova(::Type{FTest}, 
-            trm::TableRegressionModel{<: FixedEffectModel};
-            type::Int = 1, kwargs...)
-
-    type == 2           && throw(ArgumentError("Type 2 anova is not implemented"))
-    type in [1, 2, 3]   || throw(ArgumentError("Invalid type"))
-    assign = trm.mm.assign
-    df = dof(assign)
-    filter!(>(0), df)
+            aovm::FullModel{M}) where {M <: FixedEffectModel}
+
+    assign = asgn(predictors(aovm))
+    fullpred = predictors(aovm.model)
+    fullasgn = asgn(fullpred)
+    df = filter(>(0), dof_asgn(assign))
     # May exist some floating point error from dof_residual
-    push!(df, round(Int, dof_residual(trm)))
-    df = tuple(df...)
-    if type in [1, 3] 
-        # vcov methods
-        varβ = vcov(trm)
-        β = trm.model.coef
-        if type == 1
-            fs = abs2.(cholesky(Hermitian(inv(varβ))).U * β) 
-            offset = first(assign) - 1
-            fstat = ntuple(last(assign) - offset) do fix
-                sum(fs[findall(==(fix + offset), assign)]) / df[fix]
-            end
-        else
-            # calculate block by block
-            offset = first(assign) - 1
-            fstat = ntuple(last(assign) - offset) do fix
-                select = findall(==(fix + offset), assign)
-                β[select]' * (varβ[select, select] \ β[select]) / df[fix]
-            end
+    varβ = vcov(aovm.model)
+    β = aovm.model.coef
+    offset = first(assign) + last(fullasgn) - last(assign) - 1
+    if aovm.type == 1
+        fs = abs2.(cholesky(Hermitian(inv(varβ))).U * β)
+        fstat = ntuple(last(fullasgn) - offset) do fix
+            sum(fs[findall(==(fix + offset), fullasgn)]) / df[fix]
+        end
+    elseif aovm.type == 2
+        fstat = ntuple(last(fullasgn) - offset) do fix
+            select1 = sort!(collect(select_super_interaction(fullpred, fix + offset)))
+            select2 = setdiff(select1, fix + offset)
+            select1 = findall(in(select1), fullasgn)
+            select2 = findall(in(select2), fullasgn)
+            (β[select1]' * (varβ[select1, select1] \ β[select1]) - β[select2]' * (varβ[select2, select2] \ β[select2])) / df[fix]
+        end
+    else
+        # calculate block by block
+        fstat = ntuple(last(fullasgn) - offset) do fix
+            select = findall(==(fix + offset), fullasgn)
+            β[select]' * (varβ[select, select] \ β[select]) / df[fix]
         end
-        σ² = rss(trm.model) / last(df)
-        devs = (fstat .* σ²..., σ²) .* df
     end
-    pvalue = (ccdf.(FDist.(df[1:end - 1], last(df)), abs.(fstat))..., NaN)
-    AnovaResult{FTest}(trm, type, df, devs, (fstat..., NaN), pvalue, NamedTuple())
+    dfr = round(Int, dof_residual(aovm.model))
+    σ² = rss(aovm.model) / dfr
+    devs = @. fstat * σ² * df
+    pvalue = @. ccdf(FDist(df, dfr), abs(fstat))
+    AnovaResult{FTest}(aovm, df, devs, fstat, pvalue, NamedTuple())
 end
 
 # =================================================================================================================
 # Nested models 
 
 function anova(::Type{FTest}, 
-                trms::Vararg{TableRegressionModel{<: FixedEffectModel}}; 
-                check::Bool = true,
-                isnested::Bool = false)
+                models::Vararg{M}; 
+                check::Bool = true) where {M <: FixedEffectModel}
 
-    df = dof.(trms)
+    df = dof_pred.(models)
     ord = sortperm(collect(df))
     df = df[ord]
-    trms = trms[ord]
+    models = models[ord]
     # May exist some floating point error from dof_residual
-    dfr = round.(Int, dof_residual.(trms))
-    dev = ntuple(length(trms)) do i 
-        trms[i].model.rss
-    end
-
+    dfr = round.(Int, dof_residual.(models))
+    dev = rss.(models)
     # check comparable and nested
     check && @warn "Could not check whether models are nested: results may not be meaningful"
-    ftest_nested(trms, df, dfr, dev, last(dev) / last(dfr))
+    ftest_nested(NestedModels{M}(models), df, dfr, dev, last(dev) / last(dfr))
 end
 
+anova(::Type{FTest}, aovm::NestedModels{M}) where {M <: FixedEffectModel} = 
+    lrt_nested(aovm, dof_pred.(aovm.model), rss.(aovm.model), 1)
+
 """
     lfe(formula::FormulaTerm, df, vcov::CovarianceEstimator = Vcov.simple(); kwargs...)
 
-Fit a `FixedEffectModel` and wrap it into `TableRegressionModel`. 
-!!! warn
-    This function currently does not perform well. It re-compiles everytime; may be due to `@nonspecialize` for parameters of `reg`.
+A `GLM`-styled function to fit a fixed-effect model. 
 """
 lfe(formula::FormulaTerm, df, vcov::CovarianceEstimator = Vcov.simple(); kwargs...) = 
-    to_trm(reg(df, formula, vcov; kwargs...), df)
-
-"""
-    to_trm(model, df)
-
-Wrap fitted `FixedEffectModel` into `TableRegressionModel`.
-"""
-function to_trm(model::FixedEffectModel, df)
-    f = model.formula
-    has_fe_intercept = any(fe_intercept(f))
-    rhs = vectorize(f.rhs)
-    f = isa(first(rhs), ConstantTerm) ? f : FormulaTerm(f.lhs, (ConstantTerm(1), rhs...))
-    s = schema(f, df, model.contrasts)
-    f = apply_schema(f, s, FixedEffectModel, has_fe_intercept)
-    mf = ModelFrame(f, s, columntable(df[!, getproperty.(keys(s), :sym)]), FixedEffectModel)
-    # Fake modelmatrix
-    assign = mapreduce(((i, t), ) -> i*ones(width_fe(t)),
-                        append!,
-                        enumerate(vectorize(f.rhs.terms)),
-                        init=Int[])
-    has_fe_intercept && popfirst!(assign)
-    mm = ModelMatrix(ones(Float64, 1, 1), assign)
-    TableRegressionModel(model, mf, mm)
-end
+    reg(df, formula, vcov; kwargs...)
 
 # =================================================================================================================================
 # Fit new models
 """
-    anova_lfe(f::FormulaTerm, df, vcov::CovarianceEstimator = Vcov.simple(); 
+    anova_lfe(f::FormulaTerm, tbl, vcov::CovarianceEstimator = Vcov.simple(); 
             test::Type{<: GoodnessOfFit} = FTest, <keyword arguments>)
-    anova_lfe(test::Type{<: GoodnessOfFit}, f::FormulaTerm, df, vcov::CovarianceEstimator = Vcov.simple(); <keyword arguments>)
+    anova_lfe(test::Type{<: GoodnessOfFit}, f::FormulaTerm, tbl, vcov::CovarianceEstimator = Vcov.simple(); <keyword arguments>)
 
 ANOVA for fixed-effect linear regression.
 * `vcov`: estimator of covariance matrix.
-* `type`: type of anova (1 or 3). Default value is 1.
+* `type`: type of anova (1 , 2 or 3). Default value is 1.
 
-`anova_lfe` generate a `TableRegressionModel{<: FixedEffectModel}`.
+`anova_lfe` generates a `FixedEffectModel`.
 """
-anova_lfe(f::FormulaTerm, df, vcov::CovarianceEstimator = Vcov.simple(); 
+anova_lfe(f::FormulaTerm, tbl, vcov::CovarianceEstimator = Vcov.simple(); 
         test::Type{<: GoodnessOfFit} = FTest, 
         kwargs...)= 
-    anova(test, FixedEffectModel, f, df, vcov; kwargs...)
+    anova(test, FixedEffectModel, f, tbl, vcov; kwargs...)
 
-anova_lfe(test::Type{<: GoodnessOfFit}, f::FormulaTerm, df, vcov::CovarianceEstimator = Vcov.simple(); kwargs...) = 
-    anova(test, FixedEffectModel, f, df, vcov; kwargs...)
+anova_lfe(test::Type{<: GoodnessOfFit}, f::FormulaTerm, tbl, vcov::CovarianceEstimator = Vcov.simple(); kwargs...) = 
+    anova(test, FixedEffectModel, f, tbl, vcov; kwargs...)
 
-function anova(test::Type{<: GoodnessOfFit}, ::Type{FixedEffectModel}, f::FormulaTerm, df, vcov::CovarianceEstimator = Vcov.simple(); 
+function anova(test::Type{<: GoodnessOfFit}, ::Type{FixedEffectModel}, f::FormulaTerm, tbl, vcov::CovarianceEstimator = Vcov.simple(); 
         type::Int = 1, 
         kwargs...)
-    trm = to_trm(reg(df, f, vcov; kwargs...), df)
-    anova(test, trm; type)
+    model = lfe(f, tbl, vcov; kwargs...)
+    anova(test, model; type)
 end
 
 
diff --git a/src/fit.jl b/src/fit.jl
@@ -1,14 +1,8 @@
 # ==========================================================================================================
 # Backend funcion
-
-fe_intercept(f::FormulaTerm) = fe_intercept(f.rhs)
-fe_intercept(term::StatsModels.TupleTerm) = map(fe_intercept, term)
-fe_intercept(term::FunctionTerm) = first(term.exorig.args) == :fe 
-fe_intercept(term) = false
-
-width_fe(term::FunctionTerm) = first(term.exorig.args) == :fe ? 0 : 1
-width_fe(ts::InteractionTerm) = prod(width_fe(t) for t in ts.terms)
-width_fe(term) = width(term)
+formula(model::T) where {T <: FixedEffectModel} = model.formula
+predictors(model::T) where {T <: FixedEffectModel} = model.formula_schema.rhs.terms
 
 # Variable dispersion
-dof(trm::TableRegressionModel{<: FixedEffectModel}) = trm.model.nobs - trm.model.dof_residual + 1
+dof_pred(model::FixedEffectModel) = nobs(model) - dof_residual(model)
+# define dof on NestedModels?
diff --git a/src/io.jl b/src/io.jl
@@ -1,46 +1,41 @@
 # ======================================================================================================
 # IO
-import AnovaBase: AnovaTable, anovatable
-function coefnames(aov::AnovaResult{T, FTest}; kwargs...) where {T <: TableRegressionModel{<: FixedEffectModel}}
-    v = coefnames(aov.model, Val(:anova))
-    push!(v, "(Residuals)")
-    v
-end
-
-coefnames(trm::TableRegressionModel{<: FixedEffectModel}, anova::Val{:anova}) =
-    vectorize(coefnames(formula(trm).rhs.terms[unique(trm.mm.assign)], anova))
-
 # anovatable api
-function anovatable(aov::AnovaResult{<: TableRegressionModel{<: FixedEffectModel}, FTest}; kwargs...)
-    AnovaTable(hcat(vectorize.((dof(aov), deviance(aov), deviance(aov) ./ dof(aov), teststat(aov), pval(aov)))...),
+function anovatable(aov::AnovaResult{<: FullModel{<: T}, FTest}; rownames = push!(prednames(aov), "(Residuals)")) where {T <: FixedEffectModel}
+    dfr = round(Int, dof_residual(aov.anovamodel.model))
+    σ² = rss(aov.anovamodel.model) / dfr
+    AnovaTable([
+                [dof(aov)..., dfr], 
+                [deviance(aov)..., dfr * σ²], 
+                [(deviance(aov) ./ dof(aov))..., σ²], 
+                [teststat(aov)..., NaN], 
+                [pval(aov)..., NaN]
+                ],
               ["DOF", "Exp.SS", "Mean Square", "F value","Pr(>|F|)"],
-              ["x$i" for i in eachindex(pval(aov))], 5, 4)
+              rownames, 5, 4)
 end 
 
-function anovatable(aov::AnovaResult{<: Tuple, FTest}, 
-                    modeltype1::Type{<: TableRegressionModel{<: FixedEffectModel}}, 
-                    modeltype2::Type{<: TableRegressionModel{<: FixedEffectModel}};
-                    kwargs...)
+function anovatable(aov::AnovaResult{<: NestedModels{<: FixedEffectModel, N}, FTest}; rownames = string.(1:N)) where N
 
-    rs = r2.(aov.model)
-    rws = ntuple(length(aov.model)) do i 
-        aov.model[i].model.r2_within
+    rs = r2.(aov.anovamodel.model)
+    rws = ntuple(length(aov.anovamodel.model)) do i 
+        aov.anovamodel.model[i].r2_within
     end
     Δrs = _diff(rs)
     Δrws = _diff(rws)
-    AnovaTable(hcat(vectorize.((
+    AnovaTable([
                     dof(aov), 
                     [NaN, _diff(dof(aov))...], 
-                    dof_residual(aov), 
+                    repeat([round(Int, dof_residual(last(aov.anovamodel.model)))], N), 
                     rs,
                     [NaN, Δrs...],
                     rws,
                     [NaN, Δrws...],
-                    deviance(aov), 
+                    deviance(aov),                     
                     [NaN, _diffn(deviance(aov))...], 
                     teststat(aov), 
                     pval(aov)
-                    ))...),
+                ],
                 ["DOF", "ΔDOF", "Res.DOF", "R²", "ΔR²", "R²_within", "ΔR²_within", "Res.SS", "Exp.SS", "F value", "Pr(>|F|)"],
-                ["$i" for i in eachindex(pval(aov))], 11, 10)
+                rownames, 11, 10)
 end