Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 5 additions & 40 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,70 +1,35 @@
name: CI
on:
pull_request:
branches:
- master
push:
branches:
- master
tags: '*'
branches: [master]
tags: ['*']
workflow_dispatch:
jobs:
test:
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
continue-on-error: ${{ matrix.version == 'nightly' }}
strategy:
fail-fast: false
matrix:
version:
- '1.10'
- '1'
# - 'nightly'
os:
- ubuntu-latest
arch:
- x64
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
- uses: julia-actions/setup-julia@v2
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: actions/cache@v4
env:
cache-name: cache-artifacts
with:
path: ~/.julia/artifacts
key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
restore-keys: |
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
- uses: julia-actions/cache@v2
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v5
with:
files: lcov.info
# docs:
# name: Documentation
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v4
# - uses: julia-actions/setup-julia@v2
# with:
# version: '1'
# - run: |
# julia --project=docs -e '
# using Pkg
# Pkg.develop(PackageSpec(path=pwd()))
# Pkg.instantiate()'
# - run: |
# julia --project=docs -e '
# using Documenter: doctest
# using ForwardDiff
# doctest(ForwardDiff)'
# - run: julia --project=docs docs/make.jl
# env:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
token: ${{ secrets.CODECOV_TOKEN }}
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "FixedEffectModels"
uuid = "9d5cd8c9-2029-5cab-9928-427838db53e3"
version = "1.13.0"
version = "1.13.1"

[deps]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ formula(m::FixedEffectModel)
predict(m::FixedEffectModel, df)
residuals(m::FixedEffectModel, df)
```
Note that the functions `predict` and `residuals` require a table (`df`) as a second argument because the object returned by `reg` does not store the original dataset (to keep the model lightweight). For background on the tradeoff of storing the original data inside fitted model objects, see [1](http://www.r-bloggers.com/trimming-the-fat-from-glm-models-in-r/), [2](https://blogs.oracle.com/R/entry/is_the_size_of_your), [3](http://stackoverflow.com/questions/21896265/how-to-minimize-size-of-object-of-class-lm-without-compromising-it-being-passe), [4](http://stackoverflow.com/questions/15260429/is-there-a-way-to-compress-an-lm-class-for-later-prediction), [5](http://stackoverflow.com/questions/26010742/using-stargazer-with-memory-greedy-glm-objects), and [6](http://stackoverflow.com/questions/22577161/not-enough-ram-to-run-stargazer-the-normal-way).
Note that the functions `predict` and `residuals` require a table (`df`) as a second argument because the object returned by `reg` does not store the original dataset (to keep the model lightweight). For models with fixed effects, `predict` additionally requires estimating the model with `save = :fe` (or `:all`), and in-sample residual retrieval requires `save = :residuals` (or `:all`). For background on the tradeoff of storing the original data inside fitted model objects, see [1](http://www.r-bloggers.com/trimming-the-fat-from-glm-models-in-r/), [2](https://blogs.oracle.com/R/entry/is_the_size_of_your), [3](http://stackoverflow.com/questions/21896265/how-to-minimize-size-of-object-of-class-lm-without-compromising-it-being-passe), [4](http://stackoverflow.com/questions/15260429/is-there-a-way-to-compress-an-lm-class-for-later-prediction), [5](http://stackoverflow.com/questions/26010742/using-stargazer-with-memory-greedy-glm-objects), and [6](http://stackoverflow.com/questions/22577161/not-enough-ram-to-run-stargazer-the-normal-way).

Finally, you can use [RegressionTables.jl](https://github.com/jmboehm/RegressionTables.jl) to get publication-quality regression tables.

Expand Down
56 changes: 34 additions & 22 deletions src/FixedEffectModel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -113,24 +113,23 @@ end

# predict, residuals, modelresponse

# Utility functions for checking whether FE/continuous interactions are in formula
# These are currently not supported in predict
function is_cont_fe_int(x)
# Utility functions for checking whether FE/non-FE interactions are in formula.
# These are currently not supported in predict.
function is_cont_fe_int(x)
x isa InteractionTerm || return false
any(x -> isa(x, Term), x.terms) && any(x -> isa(x, FunctionTerm{typeof(fe), Vector{Term}}), x.terms)
any(has_fe, x.terms) && any(t -> !has_fe(t), x.terms)
end

# Does the formula have InteractionTerms?
has_cont_fe_interaction(::AbstractTerm) = false
has_cont_fe_interaction(x::InteractionTerm) = is_cont_fe_int(x)
has_cont_fe_interaction(x::NTuple{N, AbstractTerm}) where {N} = any(has_cont_fe_interaction, x)
function has_cont_fe_interaction(x::FormulaTerm)
if x.rhs isa AbstractTerm # only one term
is_cont_fe_int(x)
elseif hasfield(typeof(x.rhs), :lhs) # Is an IV term
false # Is this correct?
else
any(is_cont_fe_int, x.rhs)
end
has_cont_fe_interaction(x.lhs) || any(has_cont_fe_interaction, eachterm(x.rhs))
end

_is_no_regressor_rhs(rhs) = rhs == MatrixTerm((InterceptTerm{false}(),))
_is_intercept_only_rhs(rhs) = rhs == MatrixTerm((InterceptTerm{true}(),))

function StatsAPI.predict(m::FixedEffectModel, data)
Tables.istable(data) ||
throw(ArgumentError("Expected second argument to be a Table, got $(typeof(data))"))
Expand All @@ -140,11 +139,13 @@ function StatsAPI.predict(m::FixedEffectModel, data)

# only fixed effects
cdata = StatsModels.columntable(data)
nrows = length(Tables.rows(cdata))
if m.formula_schema.rhs == MatrixTerm((InterceptTerm{false}(),))
has_fe(m) || throw(ArgumentError("To be used with predict, a model requires regressors or fixed effects"))
nrows = length(Tables.rows(data))
if _is_no_regressor_rhs(m.formula_schema.rhs)
out = zeros(Float64, nrows)
nonmissings = trues(nrows)
elseif _is_intercept_only_rhs(m.formula_schema.rhs)
out = fill(only(m.coef), nrows)
nonmissings = trues(nrows)
else
cols, nonmissings = StatsModels.missing_omit(cdata, m.formula_schema.rhs)
Xnew = modelmatrix(m.formula_schema, cols)
Expand Down Expand Up @@ -186,14 +187,26 @@ function StatsAPI.residuals(m::FixedEffectModel, data)
has_fe(m) &&
throw(ArgumentError("To access residuals for a model with high-dimensional fixed effects, run `m = reg(..., save = :residuals)` and then access residuals with `residuals(m)`."))
cdata = StatsModels.columntable(data)
cols, nonmissings = StatsModels.missing_omit(cdata, m.formula_schema.rhs)
Xnew = modelmatrix(m.formula_schema, cols)
y = response(m.formula_schema, cdata)
if all(nonmissings)
out = y - Xnew * m.coef
if _is_no_regressor_rhs(m.formula_schema.rhs)
out = y
elseif _is_intercept_only_rhs(m.formula_schema.rhs)
observed = .!ismissing.(y)
if all(observed)
out = y .- only(m.coef)
else
out = Vector{Union{Float64, Missing}}(missing, length(y))
out[observed] = y[observed] .- only(m.coef)
end
else
out = Vector{Union{Float64, Missing}}(missing, length(Tables.rows(cdata)))
out[nonmissings] = y - Xnew * m.coef
cols, nonmissings = StatsModels.missing_omit(cdata, m.formula_schema.rhs)
Xnew = modelmatrix(m.formula_schema, cols)
if all(nonmissings)
out = y - Xnew * m.coef
else
out = Vector{Union{Float64, Missing}}(missing, length(Tables.rows(data)))
out[nonmissings] = y - Xnew * m.coef
end
end
return out
end
Expand Down Expand Up @@ -361,4 +374,3 @@ function StatsModels.apply_schema(t::FormulaTerm, schema::StatsModels.Schema, Mo
FormulaTerm(apply_schema(t.lhs, schema.schema, StatisticalModel),
StatsModels.collect_matrix_terms(apply_schema(t.rhs, schema, StatisticalModel)))
end

2 changes: 1 addition & 1 deletion src/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ function reg(df,
progress_bar::Bool = true,
subset::Union{Nothing, AbstractVector} = nothing,
first_stage::Bool = true)
StatsAPI.fit(FixedEffectModel, formula, df, vcov; contrasts = contrasts, weights = weights, save = save, method = method, double_precision = double_precision, tol = tol, maxiter = maxiter, drop_singletons = drop_singletons, progress_bar = progress_bar, subset = subset, first_stage = first_stage)
StatsAPI.fit(FixedEffectModel, formula, df, vcov; contrasts = contrasts, weights = weights, save = save, method = method, nthreads = nthreads, double_precision = double_precision, tol = tol, maxiter = maxiter, drop_singletons = drop_singletons, progress_bar = progress_bar, subset = subset, first_stage = first_stage)
end

function StatsAPI.fit(::Type{FixedEffectModel},
Expand Down
10 changes: 8 additions & 2 deletions src/partial_out.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ function partial_out(
@nospecialize(tol::Real = double_precision ? 1e-8 : 1e-6),
@nospecialize(align = true))

# Normalize generic Tables.jl input once; the rest of the function uses
# DataFrame indexing/views, and copycols = false avoids copies when possible.
df = DataFrame(df; copycols = false)

if (ConstantTerm(0) ∉ eachterm(f.rhs)) && (ConstantTerm(1) ∉ eachterm(f.rhs))
f = FormulaTerm(f.lhs, tuple(ConstantTerm(1), eachterm(f.rhs)...))
end
Expand All @@ -53,8 +57,10 @@ function partial_out(


# create a dataframe without missing values & negative weights
all_vars = StatsModels.termvars(formula)
esample = completecases(df[!, all_vars])
main_vars = unique(StatsModels.termvars(formula))
fe_vars = unique(StatsModels.termvars(formula_fes))
all_vars = unique(vcat(main_vars, fe_vars))
esample = completecases(df, all_vars)
if has_weights
esample .&= BitArray(!ismissing(x) && (x > 0) for x in df[!, weights])
end
Expand Down
28 changes: 28 additions & 0 deletions test/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -750,3 +750,31 @@ end
x = reg(df1, @formula(a ~ b + fe(c)))
@test coef(x) ≈ [0.5] atol = 1e-4
end

@testset "keyword arguments" begin
df = DataFrame(y = [1.0, 2.0, 2.5], x = [0.0, 1.0, 2.0])
@test_logs (:info, r"The keyword argument nthreads is deprecated") reg(df, @formula(y ~ x), nthreads = 1)
@test_logs (:info, r"method = :gpu is deprecated") reg(df, @formula(y ~ x), method = :gpu)
end

@testset "error handling" begin
df = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, 1.0, 2.0], z = [1.0, 2.0, 3.0], w = [1.0, 2.0, Inf])

@test_throws ArgumentError reg(df, @formula(y ~ x), save = :bad)
@test_throws DimensionMismatch reg(df, @formula(y ~ x), subset = [true, false])
@test_throws ArgumentError reg(df, @formula(y ~ x), subset = falses(3))
@test_throws ArgumentError adjr2(reg(df, @formula(y ~ x)), :bad)
@test_throws ArgumentError reg(df, @formula(y ~ x), weights = :w)

df_yinf = DataFrame(y = [1.0, Inf, 3.0], x = [0.0, 1.0, 2.0])
@test_throws ArgumentError reg(df_yinf, @formula(y ~ x))

df_xinf = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, Inf, 2.0])
@test_throws ArgumentError reg(df_xinf, @formula(y ~ x))

df_endoinf = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, Inf, 2.0], z = [1.0, 2.0, 3.0])
@test_throws ArgumentError reg(df_endoinf, @formula(y ~ (x ~ z)))

df_ivinf = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, 1.0, 2.0], z = [1.0, Inf, 3.0])
@test_throws ArgumentError reg(df_ivinf, @formula(y ~ (x ~ z)))
end
16 changes: 16 additions & 0 deletions test/partial_out.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,21 @@ using FixedEffectModels, DataFrames, Statistics, CSV, Test
@test Matrix(partial_out(df, @formula(Sales + Price ~ NDI + fe(State)), weights = :Pop)[1])[1:5, :] ≈ [ -22.2429 -1.2635 ; -20.5296 -1.1515 ; -17.2164 -2.23949; -19.1378 -1.45057; -19.854 -2.28819] atol = 1e-3
@test Matrix(partial_out(df, @formula(Sales + Price ~ 1 + fe(State)), weights = :Pop)[1])[1:5, :] ≈ [ -14.0383 -43.1224; -12.5383 -41.9224; -9.43825 -41.9224; -11.5383 -40.2224; -12.4383 -40.1224] atol = 1e-3
@test Matrix(partial_out(df, @formula(Sales + Price ~ 1), weights = :Pop)[1])[1:5, :] ≈ [ -26.3745 -44.9103; -24.8745 -43.7103; -21.7745 -43.7103; -23.8745 -42.0103; -24.7745 -41.9103] atol = 1e-3

df_missing = DataFrame(y = [1.0, 2.0, 3.0, 4.0], g = ["a", missing, "a", "b"])
out_missing, esample_missing, _, _ = partial_out(df_missing, @formula(y ~ 1 + fe(g)))
out_nomissing, esample_nomissing, _, _ = partial_out(dropmissing(df_missing, :g), @formula(y ~ 1 + fe(g)); align = false)
@test esample_missing == [true, false, true, true]
@test all(esample_nomissing)
@test ismissing(out_missing.y[2])
@test collect(skipmissing(out_missing.y)) ≈ out_nomissing.y
end

@testset "partial out errors" begin
df = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, 1.0, 2.0], z = [1.0, 2.0, 3.0], w = [1.0, 2.0, Inf])

@test_throws ArgumentError partial_out(df, @formula(y ~ (x ~ z)))
@test_throws ArgumentError partial_out(df, @formula(y ~ x), weights = :w)
@test_throws ArgumentError partial_out(DataFrame(y = [missing, missing], x = [1.0, 2.0]), @formula(y ~ x))
end

40 changes: 38 additions & 2 deletions test/predict.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using FixedEffectModels, DataFrames, CategoricalArrays, CSV, Test
using CUDA, Metal



Expand Down Expand Up @@ -38,6 +39,17 @@ using FixedEffectModels, DataFrames, CategoricalArrays, CSV, Test
end

@testset "Predict" begin
# Intercept only
df = DataFrame(y = [1.0, 2.0, 3.0])
m = reg(df, @formula(y ~ 1))
pred = predict(m, df)
@test pred ≈ fill(2.0, 3)

# No regressors
m = reg(df, @formula(y ~ 0))
pred = predict(m, df)
@test pred ≈ zeros(3)

# Simple - one binary FE
df = DataFrame(x = rand(100), g = rand(["a", "b"], 100))
df.y = 1.0 .+ 0.5 .* df.x .+ (df.g .== "b")
Expand Down Expand Up @@ -155,6 +167,12 @@ end
m = reg(df, @formula(y ~ x + fe(g1) + fe(g2)&z + fe(g1)&x); save = :fe)
@test_throws ArgumentError pred = predict(m, df)

# FE/continuous interaction as the only regressor
df = DataFrame(g = rand(["a", "b"], 100), z = rand(100))
df.y = 1.5 .+ 2.0 .* (df.g .== "b") .* df.z
m = reg(df, @formula(y ~ fe(g)&z); save = :fe)
@test_throws ArgumentError pred = predict(m, df)

# Regular FE + interacted FE + FE/continuous interaction
df = DataFrame(x = rand(100), g1 = rand(["a", "b"], 100), g2 = rand(["c", "d"], 100),
g3 = rand(["e", "f"], 100), g4 = rand(["g", "h"], 100), z = rand(100))
Expand All @@ -171,6 +189,20 @@ end
@test all(skipmissing(out1 .≈ out2))
end

@testset "predict/residual error paths" begin
df = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, 1.0, 2.0], g = ["a", "a", "b"])

@test_throws ArgumentError predict(reg(df, @formula(y ~ x)), 1)
@test_throws ArgumentError residuals(reg(df, @formula(y ~ x)), 1)
@test_throws ArgumentError fe(reg(df, @formula(y ~ x)))
@test_throws ArgumentError residuals(reg(df, @formula(y ~ x)))

m_fe = reg(df, @formula(y ~ x + fe(g)))
@test_throws ArgumentError predict(m_fe, df)
@test_throws ArgumentError residuals(m_fe, df)
@test_throws ArgumentError residuals(m_fe)
end

@testset "Continuous/FE detection" begin
# Regular interaction is fine as handled by StatsModels
@test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ x + y&z)) == false
Expand All @@ -179,6 +211,7 @@ end
@test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ x + fe(y)&fe(z))) == false

# Interaction of FEs with continuous variable requires special handling, currently not implemented
@test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ fe(y)&z)) == true
@test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ x + fe(y)&z)) == true
@test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ x + y&fe(z))) == true
@test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ x + fe(y)&fe(z)&a)) == true
Expand All @@ -190,6 +223,11 @@ end

@testset "residuals" begin

df = DataFrame(y = [1.0, 2.0, 3.0])
model = @formula y ~ 1
result = reg(df, model)
@test residuals(result, df) ≈ [-1.0, 0.0, 1.0]

df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv")))
df.StateC = categorical(df.State)

Expand Down Expand Up @@ -408,5 +446,3 @@ end
@test fe(result)[1, :fe_Year] ≈ 164.7 atol = 1e-1
end
end


Loading