Common MLJ Workflows

Data ingestion

import RDatasets
channing = RDatasets.dataset("boot", "channing")
first(channing, 4)

4 rows × 5 columns

     Sex    Entry   Exit    Time    Cens
     Cat…   Int32   Int32   Int32   Int32
1    Male   782     909     127     1
2    Male   1020    1128    108     1
3    Male   856     969     113     1
4    Male   915     957     42      1

Inspecting metadata, including column scientific types:

schema(channing)
┌─────────┬────────────────────────────────┬───────────────┐
│ _.names │ _.types                        │ _.scitypes    │
├─────────┼────────────────────────────────┼───────────────┤
│ Sex     │ CategoricalValue{String,UInt8} │ Multiclass{2} │
│ Entry   │ Int32                          │ Count         │
│ Exit    │ Int32                          │ Count         │
│ Time    │ Int32                          │ Count         │
│ Cens    │ Int32                          │ Count         │
└─────────┴────────────────────────────────┴───────────────┘
_.nrows = 462

Unpacking data and correcting for wrong scitypes:

y, X =  unpack(channing,
               ==(:Exit),            # y is the :Exit column
               !=(:Time);            # X is the rest, except :Time
               :Exit=>Continuous,
               :Entry=>Continuous,
               :Cens=>Multiclass)
first(X, 4)

4 rows × 3 columns

     Sex    Entry     Cens
     Cat…   Float64   Cat…
1    Male   782.0     1
2    Male   1020.0    1
3    Male   856.0     1
4    Male   915.0     1

Note: Before Julia 1.2, replace !=(:Time) with col -> col != :Time.

y[1:4]
4-element Array{Float64,1}:
  909.0
 1128.0
  969.0
  957.0

Loading a built-in supervised dataset:

X, y = @load_iris;
selectrows(X, 1:4) # selectrows works for any Tables.jl table
(sepal_length = [5.1, 4.9, 4.7, 4.6],
 sepal_width = [3.5, 3.0, 3.2, 3.1],
 petal_length = [1.4, 1.4, 1.3, 1.5],
 petal_width = [0.2, 0.2, 0.2, 0.2],)
y[1:4]
4-element CategoricalArray{String,1,UInt32}:
 "setosa"
 "setosa"
 "setosa"
 "setosa"

Model search

Reference: Model Search

Searching for a supervised model:

X, y = @load_boston
models(matching(X, y))
57-element Array{NamedTuple{(:name, :package_name, :is_supervised, :docstring, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :is_pure_julia, :is_wrapper, :load_path, :package_license, :package_url, :package_uuid, :prediction_type, :supports_online, :supports_weights, :input_scitype, :target_scitype, :output_scitype),T} where T<:Tuple,1}:
 (name = ARDRegressor, package_name = ScikitLearn, ... )
 (name = AdaBoostRegressor, package_name = ScikitLearn, ... )
 (name = BaggingRegressor, package_name = ScikitLearn, ... )
 (name = BayesianRidgeRegressor, package_name = ScikitLearn, ... )
 (name = ConstantRegressor, package_name = MLJModels, ... )
 (name = DecisionTreeRegressor, package_name = DecisionTree, ... )
 (name = DeterministicConstantRegressor, package_name = MLJModels, ... )
 (name = DummyRegressor, package_name = ScikitLearn, ... )
 (name = ElasticNetCVRegressor, package_name = ScikitLearn, ... )
 (name = ElasticNetRegressor, package_name = MLJLinearModels, ... )
 ⋮
 (name = RidgeRegressor, package_name = MultivariateStats, ... )
 (name = RidgeRegressor, package_name = ScikitLearn, ... )
 (name = RobustRegressor, package_name = MLJLinearModels, ... )
 (name = SGDRegressor, package_name = ScikitLearn, ... )
 (name = SVMLinearRegressor, package_name = ScikitLearn, ... )
 (name = SVMNuRegressor, package_name = ScikitLearn, ... )
 (name = SVMRegressor, package_name = ScikitLearn, ... )
 (name = TheilSenRegressor, package_name = ScikitLearn, ... )
 (name = XGBoostRegressor, package_name = XGBoost, ... )
models(matching(X, y))[6]
CART decision tree regressor.
→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).
→ do `@load DecisionTreeRegressor pkg="DecisionTree"` to use the model.
→ do `?DecisionTreeRegressor` for documentation.
(name = "DecisionTreeRegressor",
 package_name = "DecisionTree",
 is_supervised = true,
 docstring = "CART decision tree regressor.\n→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).\n→ do `@load DecisionTreeRegressor pkg=\"DecisionTree\"` to use the model.\n→ do `?DecisionTreeRegressor` for documentation.",
 hyperparameter_ranges = (nothing, nothing, nothing, nothing, nothing, nothing, nothing),
 hyperparameter_types = ("Int64", "Int64", "Int64", "Float64", "Int64", "Bool", "Float64"),
 hyperparameters = (:max_depth, :min_samples_leaf, :min_samples_split, :min_purity_increase, :n_subfeatures, :post_prune, :merge_purity_threshold),
 implemented_methods = [:predict, :clean!, :fit, :fitted_params],
 is_pure_julia = true,
 is_wrapper = false,
 load_path = "MLJDecisionTreeInterface.DecisionTreeRegressor",
 package_license = "MIT",
 package_url = "https://github.com/bensadeghi/DecisionTree.jl",
 package_uuid = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb",
 prediction_type = :deterministic,
 supports_online = false,
 supports_weights = false,
 input_scitype = Table{_s24} where _s24<:Union{AbstractArray{_s23,1} where _s23<:Continuous, AbstractArray{_s23,1} where _s23<:Count, AbstractArray{_s23,1} where _s23<:OrderedFactor},
 target_scitype = AbstractArray{Continuous,1},
 output_scitype = Unknown,)

More refined searches:

models() do model
    matching(model, X, y) &&
    model.prediction_type == :deterministic &&
    model.is_pure_julia
end
18-element Array{NamedTuple{(:name, :package_name, :is_supervised, :docstring, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :is_pure_julia, :is_wrapper, :load_path, :package_license, :package_url, :package_uuid, :prediction_type, :supports_online, :supports_weights, :input_scitype, :target_scitype, :output_scitype),T} where T<:Tuple,1}:
 (name = DecisionTreeRegressor, package_name = DecisionTree, ... )
 (name = DeterministicConstantRegressor, package_name = MLJModels, ... )
 (name = ElasticNetRegressor, package_name = MLJLinearModels, ... )
 (name = EvoTreeRegressor, package_name = EvoTrees, ... )
 (name = HuberRegressor, package_name = MLJLinearModels, ... )
 (name = KNNRegressor, package_name = NearestNeighbors, ... )
 (name = KPLSRegressor, package_name = PartialLeastSquaresRegressor, ... )
 (name = LADRegressor, package_name = MLJLinearModels, ... )
 (name = LassoRegressor, package_name = MLJLinearModels, ... )
 (name = LinearRegressor, package_name = MLJLinearModels, ... )
 (name = LinearRegressor, package_name = MultivariateStats, ... )
 (name = NeuralNetworkRegressor, package_name = MLJFlux, ... )
 (name = PLSRegressor, package_name = PartialLeastSquaresRegressor, ... )
 (name = QuantileRegressor, package_name = MLJLinearModels, ... )
 (name = RandomForestRegressor, package_name = DecisionTree, ... )
 (name = RidgeRegressor, package_name = MLJLinearModels, ... )
 (name = RidgeRegressor, package_name = MultivariateStats, ... )
 (name = RobustRegressor, package_name = MLJLinearModels, ... )

Searching for an unsupervised model:

models(matching(X))
24-element Array{NamedTuple{(:name, :package_name, :is_supervised, :docstring, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :is_pure_julia, :is_wrapper, :load_path, :package_license, :package_url, :package_uuid, :prediction_type, :supports_online, :supports_weights, :input_scitype, :target_scitype, :output_scitype),T} where T<:Tuple,1}:
 (name = AffinityPropagation, package_name = ScikitLearn, ... )
 (name = AgglomerativeClustering, package_name = ScikitLearn, ... )
 (name = Birch, package_name = ScikitLearn, ... )
 (name = ContinuousEncoder, package_name = MLJModels, ... )
 (name = DBSCAN, package_name = ScikitLearn, ... )
 (name = FactorAnalysis, package_name = MultivariateStats, ... )
 (name = FeatureAgglomeration, package_name = ScikitLearn, ... )
 (name = FeatureSelector, package_name = MLJModels, ... )
 (name = FillImputer, package_name = MLJModels, ... )
 (name = ICA, package_name = MultivariateStats, ... )
 ⋮
 (name = MeanShift, package_name = ScikitLearn, ... )
 (name = MiniBatchKMeans, package_name = ScikitLearn, ... )
 (name = OPTICS, package_name = ScikitLearn, ... )
 (name = OneClassSVM, package_name = LIBSVM, ... )
 (name = OneHotEncoder, package_name = MLJModels, ... )
 (name = PCA, package_name = MultivariateStats, ... )
 (name = PPCA, package_name = MultivariateStats, ... )
 (name = SpectralClustering, package_name = ScikitLearn, ... )
 (name = Standardizer, package_name = MLJModels, ... )

Getting the metadata entry for a given model type:

info("PCA")
info("RidgeRegressor", pkg="MultivariateStats") # a model type in multiple packages
Ridge regressor with regularization parameter lambda. Learns a
linear regression with a penalty on the l2 norm of the coefficients.

→ based on [MultivariateStats](https://github.com/JuliaStats/MultivariateStats.jl).
→ do `@load RidgeRegressor pkg="MultivariateStats"` to use the model.
→ do `?RidgeRegressor` for documentation.
(name = "RidgeRegressor",
 package_name = "MultivariateStats",
 is_supervised = true,
 docstring = "Ridge regressor with regularization parameter lambda. Learns a\nlinear regression with a penalty on the l2 norm of the coefficients.\n\n→ based on [MultivariateStats](https://github.com/JuliaStats/MultivariateStats.jl).\n→ do `@load RidgeRegressor pkg=\"MultivariateStats\"` to use the model.\n→ do `?RidgeRegressor` for documentation.",
 hyperparameter_ranges = (nothing, nothing),
 hyperparameter_types = ("Union{Real, Union{AbstractArray{T,1}, AbstractArray{T,2}} where T}", "Bool"),
 hyperparameters = (:lambda, :bias),
 implemented_methods = [:predict, :clean!, :fit, :fitted_params],
 is_pure_julia = true,
 is_wrapper = false,
 load_path = "MLJMultivariateStatsInterface.RidgeRegressor",
 package_license = "MIT",
 package_url = "https://github.com/JuliaStats/MultivariateStats.jl",
 package_uuid = "6f286f6a-111f-5878-ab1e-185364afe411",
 prediction_type = :deterministic,
 supports_online = false,
 supports_weights = false,
 input_scitype = Table{_s24} where _s24<:(AbstractArray{_s23,1} where _s23<:Continuous),
 target_scitype = Union{AbstractArray{Continuous,1}, Table{_s24} where _s24<:(AbstractArray{_s23,1} where _s23<:Continuous)},
 output_scitype = Unknown,)

Instantiating a model

Reference: Getting Started

@load DecisionTreeClassifier
model = DecisionTreeClassifier(min_samples_split=5, max_depth=4)
DecisionTreeClassifier(
    max_depth = 4,
    min_samples_leaf = 1,
    min_samples_split = 5,
    min_purity_increase = 0.0,
    n_subfeatures = 0,
    post_prune = false,
    merge_purity_threshold = 1.0,
    pdf_smoothing = 0.0,
    display_depth = 5) @279

or

model = @load DecisionTreeClassifier
model.min_samples_split = 5
model.max_depth = 4

Evaluating a model

Reference: Evaluating Model Performance

X, y = @load_boston
model = @load KNNRegressor
evaluate(model, X, y, resampling=CV(nfolds=5), measure=[rms, mav])
┌───────────────────────────┬───────────────┬───────────────────────────────┐
│ _.measure                 │ _.measurement │ _.per_fold                    │
├───────────────────────────┼───────────────┼───────────────────────────────┤
│ RootMeanSquaredError @712 │ 8.77          │ [8.53, 8.8, 10.7, 9.43, 5.59] │
│ MeanAbsoluteError @587    │ 6.02          │ [6.52, 5.7, 7.65, 6.09, 4.11] │
└───────────────────────────┴───────────────┴───────────────────────────────┘
_.per_observation = [missing, missing]
_.fitted_params_per_fold = [ … ]
_.report_per_fold = [ … ]

Basic fit/evaluate/predict by hand:

Reference: Getting Started, Machines, Evaluating Model Performance, Performance Measures

import RDatasets
vaso = RDatasets.dataset("robustbase", "vaso"); # a DataFrame
first(vaso, 3)

3 rows × 3 columns

     Volume    Rate      Y
     Float64   Float64   Int64
1    3.7       0.825     1
2    3.5       1.09      1
3    1.25      2.5       1
y, X = unpack(vaso, ==(:Y), c -> true; :Y => Multiclass)

tree_model = @load DecisionTreeClassifier
[ Info: For silent loading, specify `verbosity=0`.
[ Info: Model code for DecisionTreeClassifier already loaded
(MLJDecisionTreeInterface.DecisionTreeClassifier)() ✔

Bind the model and data together in a machine, which will additionally store the learned parameters (fitresults) when fit:

tree = machine(tree_model, X, y)
Machine{DecisionTreeClassifier} @025 trained 0 times.
  args: 
    1:	Source @611 ⏎ `Table{AbstractArray{Continuous,1}}`
    2:	Source @529 ⏎ `AbstractArray{Multiclass{2},1}`

Split row indices into training and evaluation rows:

train, test = partition(eachindex(y), 0.7, shuffle=true, rng=1234); # 70:30 split
([27, 28, 30, 31, 32, 18, 21, 9, 26, 14  …  7, 39, 2, 37, 1, 8, 19, 25, 35, 34], [22, 13, 11, 4, 10, 16, 3, 20, 29, 23, 12, 24])

Fit on train and evaluate on test:

fit!(tree, rows=train)
yhat = predict(tree, X[test,:])
mean(cross_entropy(yhat, y[test]))
6.5216583816514975

Predict on new data:

Xnew = (Volume=3*rand(3), Rate=3*rand(3))
predict(tree, Xnew)      # a vector of distributions
3-element MLJBase.UnivariateFiniteArray{Multiclass{2},Int64,UInt32,Float64,1}:
 UnivariateFinite{Multiclass{2}}(0=>0.273, 1=>0.727)
 UnivariateFinite{Multiclass{2}}(0=>0.9, 1=>0.1)
 UnivariateFinite{Multiclass{2}}(0=>0.273, 1=>0.727)
predict_mode(tree, Xnew) # a vector of point-predictions
3-element CategoricalArray{Int64,1,UInt32}:
 1
 0
 1

More performance evaluation examples

import LossFunctions.ZeroOneLoss

Evaluating model + data directly:

evaluate(tree_model, X, y,
         resampling=Holdout(fraction_train=0.7, shuffle=true, rng=1234),
         measure=[cross_entropy, ZeroOneLoss()])
┌───────────────────────┬───────────────┬────────────┐
│ _.measure             │ _.measurement │ _.per_fold │
├───────────────────────┼───────────────┼────────────┤
│ LogLoss{Float64} @401 │ 6.52          │ [6.52]     │
│ ZeroOneLoss           │ 0.417         │ [0.417]    │
└───────────────────────┴───────────────┴────────────┘
_.per_observation = [[[0.105, 36.0, ..., 1.3]], [[0.0, 1.0, ..., 1.0]]]
_.fitted_params_per_fold = [ … ]
_.report_per_fold = [ … ]

If a machine is already defined, as above:

evaluate!(tree,
          resampling=Holdout(fraction_train=0.7, shuffle=true, rng=1234),
          measure=[cross_entropy, ZeroOneLoss()])
┌───────────────────────┬───────────────┬────────────┐
│ _.measure             │ _.measurement │ _.per_fold │
├───────────────────────┼───────────────┼────────────┤
│ LogLoss{Float64} @401 │ 6.52          │ [6.52]     │
│ ZeroOneLoss           │ 0.417         │ [0.417]    │
└───────────────────────┴───────────────┴────────────┘
_.per_observation = [[[0.105, 36.0, ..., 1.3]], [[0.0, 1.0, ..., 1.0]]]
_.fitted_params_per_fold = [ … ]
_.report_per_fold = [ … ]

Using cross-validation:

evaluate!(tree, resampling=CV(nfolds=5, shuffle=true, rng=1234),
          measure=[cross_entropy, ZeroOneLoss()])
┌───────────────────────┬───────────────┬──────────────────────────────────┐
│ _.measure             │ _.measurement │ _.per_fold                       │
├───────────────────────┼───────────────┼──────────────────────────────────┤
│ LogLoss{Float64} @401 │ 3.27          │ [9.25, 0.598, 4.93, 1.07, 0.523] │
│ ZeroOneLoss           │ 0.436         │ [0.5, 0.375, 0.375, 0.5, 0.429]  │
└───────────────────────┴───────────────┴──────────────────────────────────┘
_.per_observation = [[[2.22e-16, 0.944, ..., 2.22e-16], [0.847, 0.56, ..., 0.56], [0.799, 0.598, ..., 36.0], [2.01, 2.01, ..., 0.143], [0.847, 2.22e-16, ..., 0.56]], [[0.0, 1.0, ..., 0.0], [1.0, 0.0, ..., 0.0], [1.0, 0.0, ..., 1.0], [1.0, 1.0, ..., 0.0], [1.0, 0.0, ..., 0.0]]]
_.fitted_params_per_fold = [ … ]
_.report_per_fold = [ … ]

With user-specified train/test pairs of row indices:

f1, f2, f3 = 1:13, 14:26, 27:36
pairs = [(f1, vcat(f2, f3)), (f2, vcat(f3, f1)), (f3, vcat(f1, f2))];
evaluate!(tree,
          resampling=pairs,
          measure=[cross_entropy, ZeroOneLoss()])
┌───────────────────────┬───────────────┬───────────────────────┐
│ _.measure             │ _.measurement │ _.per_fold            │
├───────────────────────┼───────────────┼───────────────────────┤
│ LogLoss{Float64} @401 │ 5.88          │ [2.16, 11.0, 4.51]    │
│ ZeroOneLoss           │ 0.241         │ [0.304, 0.304, 0.115] │
└───────────────────────┴───────────────┴───────────────────────┘
_.per_observation = [[[0.154, 0.154, ..., 0.154], [2.22e-16, 36.0, ..., 2.22e-16], [2.22e-16, 2.22e-16, ..., 0.693]], [[0.0, 0.0, ..., 0.0], [0.0, 1.0, ..., 0.0], [0.0, 0.0, ..., 0.0]]]
_.fitted_params_per_fold = [ … ]
_.report_per_fold = [ … ]

Changing a hyperparameter and re-evaluating:

tree_model.max_depth = 3
evaluate!(tree,
          resampling=CV(nfolds=5, shuffle=true, rng=1234),
          measure=[cross_entropy, ZeroOneLoss()])
┌───────────────────────┬───────────────┬────────────────────────────────────┐
│ _.measure             │ _.measurement │ _.per_fold                         │
├───────────────────────┼───────────────┼────────────────────────────────────┤
│ LogLoss{Float64} @401 │ 2.23          │ [9.18, 0.484, 0.427, 0.564, 0.488] │
│ ZeroOneLoss           │ 0.307         │ [0.375, 0.25, 0.25, 0.375, 0.286]  │
└───────────────────────┴───────────────┴────────────────────────────────────┘
_.per_observation = [[[2.22e-16, 1.32, ..., 2.22e-16], [2.22e-16, 0.318, ..., 0.318], [0.405, 2.22e-16, ..., 2.22e-16], [1.5, 1.5, ..., 2.22e-16], [0.636, 2.22e-16, ..., 0.754]], [[0.0, 1.0, ..., 0.0], [0.0, 0.0, ..., 0.0], [0.0, 0.0, ..., 0.0], [1.0, 1.0, ..., 0.0], [0.0, 0.0, ..., 1.0]]]
_.fitted_params_per_fold = [ … ]
_.report_per_fold = [ … ]

Inspecting training results

Fit an ordinary least-squares model to some synthetic data:

x1 = rand(100)
x2 = rand(100)

X = (x1=x1, x2=x2)
y = x1 - 2x2 + 0.1*rand(100);

ols_model = @load LinearRegressor pkg=GLM
ols =  machine(ols_model, X, y)
fit!(ols)
Machine{LinearRegressor} @529 trained 1 time.
  args: 
    1:	Source @031 ⏎ `Table{AbstractArray{Continuous,1}}`
    2:	Source @106 ⏎ `AbstractArray{Continuous,1}`

Get a named tuple representing the learned parameters, human-readable if appropriate:

fitted_params(ols)
(coef = [1.0096153485040382, -1.9927799728153508],
 intercept = 0.04179914671077211,)

Get other training-related information:

report(ols)
(deviance = 0.07518173976825744,
 dof_residual = 97.0,
 stderror = [0.010090175571601133, 0.009856737002345324, 0.007030644773504755],
 vcov = [0.00010181164306573626 -5.031432941773656e-6 -4.6572274444750014e-5; -5.031432941773656e-6 9.715526433340349e-5 -4.2133971528516224e-5; -4.6572274444750014e-5 -4.2133971528516224e-5 4.942996593120973e-5],)

Basic fit/transform for unsupervised models

Load data:

X, y = @load_iris
train, test = partition(eachindex(y), 0.97, shuffle=true, rng=123)
([125, 100, 130, 9, 70, 148, 39, 64, 6, 107  …  110, 59, 139, 21, 112, 144, 140, 72, 109, 41], [106, 147, 47, 5])

Instantiate and fit the model/machine:

@load PCA
pca_model = PCA(maxoutdim=2)
pca = machine(pca_model, X)
fit!(pca, rows=train)
Machine{PCA} @859 trained 1 time.
  args: 
    1:	Source @601 ⏎ `Table{AbstractArray{Continuous,1}}`

Transform selected data bound to the machine:

transform(pca, rows=test);
(x1 = [-3.3942826854483243, -1.5219827578765068, 2.538247455185219, 2.7299639893931373],
 x2 = [0.5472450223745241, -0.36842368617126214, 0.5199299511335698, 0.3448466122232363],)

Transform new data:

Xnew = (sepal_length=rand(3), sepal_width=rand(3),
        petal_length=rand(3), petal_width=rand(3));
transform(pca, Xnew)
(x1 = [4.999463370495924, 4.988317717855254, 4.733355784521376],
 x2 = [-4.401669016087197, -4.9607144197791735, -4.644421494820152],)

Inverting learned transformations

y = rand(100);
stand_model = UnivariateStandardizer()
stand = machine(stand_model, y)
fit!(stand)
z = transform(stand, y);
@assert inverse_transform(stand, z) ≈ y # true
[ Info: Training Machine{UnivariateStandardizer} @480.

Nested hyperparameter tuning

Reference: Tuning Models

Define a model with nested hyperparameters:

tree_model = @load DecisionTreeClassifier
forest_model = EnsembleModel(atom=tree_model, n=300)
ProbabilisticEnsembleModel(
    atom = DecisionTreeClassifier(
            max_depth = -1,
            min_samples_leaf = 1,
            min_samples_split = 2,
            min_purity_increase = 0.0,
            n_subfeatures = 0,
            post_prune = false,
            merge_purity_threshold = 1.0,
            pdf_smoothing = 0.0,
            display_depth = 5),
    atomic_weights = Float64[],
    bagging_fraction = 0.8,
    rng = Random._GLOBAL_RNG(),
    n = 300,
    acceleration = CPU1{Nothing}(nothing),
    out_of_bag_measure = Any[]) @419

Inspect all hyperparameters, even nested ones (returns nested named tuple):

params(forest_model)
(atom = (max_depth = -1,
         min_samples_leaf = 1,
         min_samples_split = 2,
         min_purity_increase = 0.0,
         n_subfeatures = 0,
         post_prune = false,
         merge_purity_threshold = 1.0,
         pdf_smoothing = 0.0,
         display_depth = 5,),
 atomic_weights = Float64[],
 bagging_fraction = 0.8,
 rng = Random._GLOBAL_RNG(),
 n = 300,
 acceleration = CPU1{Nothing}(nothing),
 out_of_bag_measure = Any[],)

Define ranges for hyperparameters to be tuned:

r1 = range(forest_model, :bagging_fraction, lower=0.5, upper=1.0, scale=:log10)
MLJBase.NumericRange(Float64, :bagging_fraction, ... )
r2 = range(forest_model, :(atom.n_subfeatures), lower=1, upper=4) # nested
MLJBase.NumericRange(Int64, :(atom.n_subfeatures), ... )

Wrap the model in a tuning strategy:

tuned_forest = TunedModel(model=forest_model,
                          tuning=Grid(resolution=12),
                          resampling=CV(nfolds=6),
                          ranges=[r1, r2],
                          measure=cross_entropy)
ProbabilisticTunedModel(
    model = ProbabilisticEnsembleModel(
            atom = DecisionTreeClassifier @745,
            atomic_weights = Float64[],
            bagging_fraction = 0.8,
            rng = Random._GLOBAL_RNG(),
            n = 300,
            acceleration = CPU1{Nothing}(nothing),
            out_of_bag_measure = Any[]),
    tuning = Grid(
            goal = nothing,
            resolution = 12,
            shuffle = true,
            rng = Random._GLOBAL_RNG()),
    resampling = CV(
            nfolds = 6,
            shuffle = false,
            rng = Random._GLOBAL_RNG()),
    measure = LogLoss(
            tol = 2.220446049250313e-16),
    weights = nothing,
    operation = MLJModelInterface.predict,
    range = MLJBase.NumericRange{T,MLJBase.Bounded,Symbol} where T[NumericRange{Float64,…} @706, NumericRange{Int64,…} @277],
    selection_heuristic = MLJTuning.NaiveSelection(nothing),
    train_best = true,
    repeats = 1,
    n = nothing,
    acceleration = CPU1{Nothing}(nothing),
    acceleration_resampling = CPU1{Nothing}(nothing),
    check_measure = true) @843

Bind the wrapped model to data:

tuned = machine(tuned_forest, X, y)
Machine{ProbabilisticTunedModel{Grid,…}} @317 trained 0 times.
  args: 
    1:	Source @496 ⏎ `Table{AbstractArray{Continuous,1}}`
    2:	Source @283 ⏎ `AbstractArray{Multiclass{3},1}`

Fitting the resultant machine optimizes the hyperparameters specified in range, using the specified tuning and resampling strategies and performance measure (possibly a vector of measures), and retrains on all data bound to the machine:

fit!(tuned)
Machine{ProbabilisticTunedModel{Grid,…}} @317 trained 1 time.
  args: 
    1:	Source @496 ⏎ `Table{AbstractArray{Continuous,1}}`
    2:	Source @283 ⏎ `AbstractArray{Multiclass{3},1}`

Inspecting the optimal model:

F = fitted_params(tuned)
(best_model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @322,
 best_fitted_params = (fitresult = WrappedEnsemble{Tuple{Node{Float64,…},…},…} @500,),)
F.best_model
ProbabilisticEnsembleModel(
    atom = DecisionTreeClassifier(
            max_depth = -1,
            min_samples_leaf = 1,
            min_samples_split = 2,
            min_purity_increase = 0.0,
            n_subfeatures = 3,
            post_prune = false,
            merge_purity_threshold = 1.0,
            pdf_smoothing = 0.0,
            display_depth = 5),
    atomic_weights = Float64[],
    bagging_fraction = 0.5325205447199813,
    rng = Random._GLOBAL_RNG(),
    n = 300,
    acceleration = CPU1{Nothing}(nothing),
    out_of_bag_measure = Any[]) @322

Inspecting details of tuning procedure:

report(tuned)
(best_model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @322,
 best_history_entry = (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @322,
                       measure = LogLoss{Float64}[LogLoss{Float64} @401],
                       measurement = [0.15165116029290174],
                       per_fold = [[3.663735981263026e-15, 3.663735981263026e-15, 0.21923421291254536, 0.2519509365015743, 0.20632317209433484, 0.23239864024894868]],),
 history = NamedTuple{(:model, :measure, :measurement, :per_fold),Tuple{MLJ.ProbabilisticEnsembleModel{MLJDecisionTreeInterface.DecisionTreeClassifier},Array{LogLoss{Float64},1},Array{Float64,1},Array{Array{Float64,1},1}}}[(model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @569, measure = [LogLoss{Float64} @401], measurement = [0.16313010263421743], per_fold = [[3.663735981263026e-15, 3.663735981263026e-15, 0.214620219113949, 0.2493036438186952, 0.23761879904739142, 0.27723795382526173]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @569, measure = [LogLoss{Float64} @401], measurement = [2.423637072586287], per_fold = [[3.663735981263026e-15, 3.663735981263026e-15, 4.353231853442493, 2.883492271129374, 2.896632433808258, 4.40846587713759]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @851, measure = [LogLoss{Float64} @401], measurement = [0.21494041076689133], per_fold = [[0.06491068877237877, 0.02628518421075993, 0.3153976546900662, 0.2704380882535723, 0.32249601521720683, 0.290114833457364]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @921, measure = [LogLoss{Float64} @401], measurement = [0.1628785360876341], per_fold = [[0.03449742776901644, 0.008044123154757303, 0.22998473417600976, 0.225127373351961, 0.23794163011742975, 0.2416759279566304]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @702, measure = [LogLoss{Float64} @401], measurement = [0.15947693421163087], per_fold = [[3.663735981263026e-15, 3.663735981263026e-15, 0.2199362541979453, 0.2515606875733344, 0.25144403837975204, 0.2339206251187461]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @648, measure = [LogLoss{Float64} @401], measurement = [0.4099108325302428], per_fold = [[0.04687519253542851, 0.01187673823255925, 0.26125639098763664, 0.2581608251810226, 1.5537032169160463, 0.3275926313287634]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @322, measure = [LogLoss{Float64} @401], measurement 
= [0.15165116029290174], per_fold = [[3.663735981263026e-15, 3.663735981263026e-15, 0.21923421291254536, 0.2519509365015743, 0.20632317209433484, 0.23239864024894868]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @170, measure = [LogLoss{Float64} @401], measurement = [0.22195725051655812], per_fold = [[3.663735981263026e-15, 3.663735981263026e-15, 0.3538539062038611, 0.3603703217629861, 0.31082453434496743, 0.306694740787527]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @775, measure = [LogLoss{Float64} @401], measurement = [0.45947509787062923], per_fold = [[0.035726840434153855, 0.004665490889876289, 0.36236833491429216, 0.44481758389018333, 1.5634330663897538, 0.345839270705516]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @765, measure = [LogLoss{Float64} @401], measurement = [0.18702291285217096], per_fold = [[0.03420779581473458, 0.006633198911351671, 0.2598741617705459, 0.2936036530674659, 0.285795983948016, 0.24202268360091161]])  …  (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @382, measure = [LogLoss{Float64} @401], measurement = [0.20893268289219202], per_fold = [[0.056106119719187574, 0.022517626156536852, 0.27697854000651817, 0.24308807600476018, 0.34079393852262546, 0.3141117969435239]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @852, measure = [LogLoss{Float64} @401], measurement = [0.2094438315718404], per_fold = [[0.06261211603880004, 0.03440427462945622, 0.30667626994638714, 0.2653545644930015, 0.30151117595693205, 0.28610458836646535]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @766, measure = [LogLoss{Float64} @401], measurement = [0.20431476361040482], per_fold = [[3.663735981263026e-15, 3.663735981263026e-15, 0.28311119117962547, 0.37956926272109615, 0.29259811719829243, 0.27061001056340767]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @955, measure = [LogLoss{Float64} @401], measurement = [0.18565830330475042], per_fold 
= [[3.663735981263026e-15, 3.663735981263026e-15, 0.2344402658669453, 0.31566326445381376, 0.2801837679847096, 0.2836625215230264]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @376, measure = [LogLoss{Float64} @401], measurement = [0.21494647892730032], per_fold = [[0.061455374986550106, 0.02308488859438163, 0.2858214415947885, 0.2531686554515285, 0.36119273739925745, 0.3049557755372955]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @243, measure = [LogLoss{Float64} @401], measurement = [0.21263271445653065], per_fold = [[0.06878568431988945, 0.02762237914540662, 0.33219267416185194, 0.2615686197325747, 0.32437496585371955, 0.2612519635257415]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @076, measure = [LogLoss{Float64} @401], measurement = [0.21124683706085823], per_fold = [[0.0538927775824108, 0.015985922557126067, 0.28209580914415155, 0.26211814106732684, 0.32563155063829824, 0.3277568213758357]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @939, measure = [LogLoss{Float64} @401], measurement = [0.619314636562445], per_fold = [[3.663735981263026e-15, 3.663735981263026e-15, 0.27648436240974034, 1.5848393371793537, 1.5597684110967414, 0.2947957086888274]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @949, measure = [LogLoss{Float64} @401], measurement = [0.8491055735929129], per_fold = [[3.663735981263026e-15, 3.663735981263026e-15, 1.5756201132972665, 1.612584632208667, 1.575861560443455, 0.3305671356080825]]), (model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @690, measure = [LogLoss{Float64} @401], measurement = [0.18867708324770346], per_fold = [[3.663735981263026e-15, 3.663735981263026e-15, 0.24207448070981413, 0.3339828364731146, 0.28284255440714573, 0.273162627896139]])],
 best_report = (measures = Any[],
                oob_measurements = missing,),
 plotting = (parameter_names = ["bagging_fraction", "atom.n_subfeatures"],
             parameter_scales = [:log10, :linear],
             parameter_values = Any[0.5325205447199813 4; 1.0 4; … ; 0.8815912549960212 3; 0.6433324490047159 4],
             measurements = [0.16313010263421743, 2.423637072586287, 0.21494041076689133, 0.1628785360876341, 0.15947693421163087, 0.4099108325302428, 0.15165116029290174, 0.22195725051655812, 0.45947509787062923, 0.18702291285217096  …  0.20893268289219202, 0.2094438315718404, 0.20431476361040482, 0.18565830330475042, 0.21494647892730032, 0.21263271445653065, 0.21124683706085823, 0.619314636562445, 0.8491055735929129, 0.18867708324770346],),)

Visualizing these results:

using Plots
plot(tuned)

Predicting on new data using the optimized model:

predict(tuned, Xnew)
3-element Array{UnivariateFinite{Multiclass{3},String,UInt32,Float64},1}:
 UnivariateFinite{Multiclass{3}}(versicolor=>0.21, virginica=>0.0233, setosa=>0.767)
 UnivariateFinite{Multiclass{3}}(versicolor=>0.113, virginica=>0.0167, setosa=>0.87)
 UnivariateFinite{Multiclass{3}}(versicolor=>0.0, virginica=>0.0, setosa=>1.0)

Constructing a linear pipeline

Reference: Composing Models

Constructing a linear (unbranching) pipeline with a learned target transformation/inverse transformation:

X, y = @load_reduced_ames
@load KNNRegressor
pipe = @pipeline(X -> coerce(X, :age=>Continuous),
                 OneHotEncoder,
                 KNNRegressor(K=3),
                 target = UnivariateStandardizer)
Pipeline259(
    one_hot_encoder = OneHotEncoder(
            features = Symbol[],
            drop_last = false,
            ordered_factor = true,
            ignore = false),
    knn_regressor = KNNRegressor(
            K = 3,
            algorithm = :kdtree,
            metric = Distances.Euclidean(0.0),
            leafsize = 10,
            reorder = true,
            weights = :uniform),
    target = UnivariateStandardizer()) @839

Evaluating the pipeline (just as you would any other model):

pipe.knn_regressor.K = 2
pipe.one_hot_encoder.drop_last = true
evaluate(pipe, X, y, resampling=Holdout(), measure=rms, verbosity=2)
┌───────────────────────────┬───────────────┬────────────┐
│ _.measure                 │ _.measurement │ _.per_fold │
├───────────────────────────┼───────────────┼────────────┤
│ RootMeanSquaredError @712 │ 53100.0       │ [53100.0]  │
└───────────────────────────┴───────────────┴────────────┘
_.per_observation = [missing]
_.fitted_params_per_fold = [ … ]
_.report_per_fold = [ … ]

Inspecting the learned parameters in a pipeline:

mach = machine(pipe, X, y) |> fit!
F = fitted_params(mach)
F.one_hot_encoder
(fitresult = OneHotEncoderResult @043,)

Constructing a linear (unbranching) pipeline with a static (unlearned) target transformation/inverse transformation:

@load DecisionTreeRegressor
pipe2 = @pipeline(X -> coerce(X, :age=>Continuous),
                  OneHotEncoder,
                  DecisionTreeRegressor(max_depth=4),
                  target = y -> log.(y),
                  inverse = z -> exp.(z))
Pipeline270(
    one_hot_encoder = OneHotEncoder(
            features = Symbol[],
            drop_last = false,
            ordered_factor = true,
            ignore = false),
    decision_tree_regressor = DecisionTreeRegressor(
            max_depth = 4,
            min_samples_leaf = 5,
            min_samples_split = 2,
            min_purity_increase = 0.0,
            n_subfeatures = 0,
            post_prune = false,
            merge_purity_threshold = 1.0),
    target = WrappedFunction(
            f = Main.ex-workflows.var"#28#29"()),
    inverse = WrappedFunction(
            f = Main.ex-workflows.var"#30#31"())) @357

Creating a homogeneous ensemble of models

Reference: Homogeneous Ensembles

X, y = @load_iris
tree_model = @load DecisionTreeClassifier
forest_model = EnsembleModel(atom=tree_model, bagging_fraction=0.8, n=300)
forest = machine(forest_model, X, y)
evaluate!(forest, measure=cross_entropy)
┌───────────────────────┬───────────────┬───────────────────────────────────────
│ _.measure             │ _.measurement │ _.per_fold                           ⋯
├───────────────────────┼───────────────┼───────────────────────────────────────
│ LogLoss{Float64} @401 │ 0.432         │ [3.66e-15, 3.66e-15, 0.317, 1.62, 0. ⋯
└───────────────────────┴───────────────┴───────────────────────────────────────
                                                                1 column omitted
_.per_observation = [[[3.66e-15, 3.66e-15, ..., 3.66e-15], [3.66e-15, 3.66e-15, ..., 3.66e-15], [0.0513, 0.00334, ..., 3.66e-15], [3.66e-15, 0.174, ..., 3.66e-15], [3.66e-15, 0.0202, ..., 3.66e-15], [0.0168, 0.426, ..., 0.0305]]]
_.fitted_params_per_fold = [ … ]
_.report_per_fold = [ … ]

Performance curves

Generating a plot of performance as a function of some hyperparameter (building on the preceding example).

Single performance curve:

r = range(forest_model, :n, lower=1, upper=1000, scale=:log10)
curve = learning_curve(forest,
                            range=r,
                            resampling=Holdout(),
                            resolution=50,
                            measure=cross_entropy,
                            verbosity=0)
(parameter_name = "n",
 parameter_scale = :log10,
 parameter_values = [1, 2, 3, 4, 5, 6, 7, 8, 10, 11  …  281, 324, 373, 429, 494, 569, 655, 754, 869, 1000],
 measurements = [9.611640903764574, 8.148330189249133, 6.568487378319261, 6.608003121338145, 6.568670431262127, 6.568487378319261, 6.588147984138892, 5.818531638968441, 5.856147377305975, 5.876579548959426  …  1.2331511530565782, 1.2345247213127193, 1.2280648272563384, 1.2271911498204102, 1.22458893917901, 1.234899008861542, 1.2325916555879235, 1.235970144808085, 1.2430204992556089, 1.2424851628217406],)
using Plots
plot(curve.parameter_values, curve.measurements, xlab=curve.parameter_name, xscale=curve.parameter_scale)

Multiple curves, one for each of the random number generators requested with `rngs=4`:

curve = learning_curve(forest,
                       range=r,
                       resampling=Holdout(),
                       measure=cross_entropy,
                       resolution=50,
                       rng_name=:rng,
                       rngs=4,
                       verbosity=0)
(parameter_name = "n",
 parameter_scale = :log10,
 parameter_values = [1, 2, 3, 4, 5, 6, 7, 8, 10, 11  …  281, 324, 373, 429, 494, 569, 655, 754, 869, 1000],
 measurements = [8.009700753137146 7.20873067782343 15.218431430960575 4.004850376568572; 8.040507294495367 7.20873067782343 15.218431430960575 4.004850376568572; … ; 1.2579739653824205 1.230501052803986 1.2428295907404219 1.2330164438810343; 1.2608201975567264 1.2297702745207544 1.243331635132298 1.2335448172875985],)
plot(curve.parameter_values, curve.measurements,
xlab=curve.parameter_name, xscale=curve.parameter_scale)