Common MLJ Workflows

Data ingestion

using RDatasets
channing = dataset("boot", "channing")
first(channing, 4)

4 rows × 5 columns

│ Row │ Sex          │ Entry │ Exit  │ Time  │ Cens  │
│     │ Categorical… │ Int32 │ Int32 │ Int32 │ Int32 │
├─────┼──────────────┼───────┼───────┼───────┼───────┤
│ 1   │ Male         │ 782   │ 909   │ 127   │ 1     │
│ 2   │ Male         │ 1020  │ 1128  │ 108   │ 1     │
│ 3   │ Male         │ 856   │ 969   │ 113   │ 1     │
│ 4   │ Male         │ 915   │ 957   │ 42    │ 1     │

Inspecting metadata, including column scientific types:

schema(channing)
_.table = 
┌─────────┬────────────────────────────────────────────┬───────────────┐
│ _.names │ _.types                                    │ _.scitypes    │
├─────────┼────────────────────────────────────────────┼───────────────┤
│ Sex     │ CategoricalArrays.CategoricalString{UInt8} │ Multiclass{2} │
│ Entry   │ Int32                                      │ Count         │
│ Exit    │ Int32                                      │ Count         │
│ Time    │ Int32                                      │ Count         │
│ Cens    │ Int32                                      │ Count         │
└─────────┴────────────────────────────────────────────┴───────────────┘
_.nrows = 462

Unpacking data and correcting wrong scitypes:

y, X =  unpack(channing,
               ==(:Exit),            # y is the :Exit column
               !=(:Time);            # X is the rest, except :Time
               :Exit=>Continuous,
               :Entry=>Continuous,
               :Cens=>Multiclass)
first(X, 4)

4 rows × 3 columns

│ Row │ Sex          │ Entry   │ Cens         │
│     │ Categorical… │ Float64 │ Categorical… │
├─────┼──────────────┼─────────┼──────────────┤
│ 1   │ Male         │ 782.0   │ 1            │
│ 2   │ Male         │ 1020.0  │ 1            │
│ 3   │ Male         │ 856.0   │ 1            │
│ 4   │ Male         │ 915.0   │ 1            │

Note: Before Julia 1.2, replace !=(:Time) with col -> col != :Time.
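
For example, on Julia versions before 1.2 the unpack call above reads:

y, X =  unpack(channing,
               ==(:Exit),              # y is the :Exit column
               col -> col != :Time;    # X is the rest, except :Time
               :Exit=>Continuous,
               :Entry=>Continuous,
               :Cens=>Multiclass)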

y[1:4]
4-element Array{Float64,1}:
  909.0
 1128.0
  969.0
  957.0

Loading a built-in supervised dataset:

X, y = @load_iris;
selectrows(X, 1:4) # selectrows works for any Tables.jl table
(sepal_length = [5.1, 4.9, 4.7, 4.6],
 sepal_width = [3.5, 3.0, 3.2, 3.1],
 petal_length = [1.4, 1.4, 1.3, 1.5],
 petal_width = [0.2, 0.2, 0.2, 0.2],)
y[1:4]
4-element CategoricalArrays.CategoricalArray{String,1,UInt8}:
 "setosa"
 "setosa"
 "setosa"
 "setosa"

Model search (experimental)

Reference: Model Search

Searching for a supervised model:

X, y = @load_boston
models(matching(X, y))
48-element Array{NamedTuple,1}:
 (name = ARDRegressor, package_name = ScikitLearn, ... )                
 (name = AdaBoostRegressor, package_name = ScikitLearn, ... )           
 (name = BaggingRegressor, package_name = ScikitLearn, ... )            
 (name = BayesianRidgeRegressor, package_name = ScikitLearn, ... )      
 (name = ConstantRegressor, package_name = MLJModels, ... )             
 (name = DecisionTreeRegressor, package_name = DecisionTree, ... )      
 (name = DeterministicConstantRegressor, package_name = MLJModels, ... )
 (name = DummyRegressor, package_name = ScikitLearn, ... )              
 (name = ElasticNetCVRegressor, package_name = ScikitLearn, ... )       
 (name = ElasticNetRegressor, package_name = MLJLinearModels, ... )     
 ⋮                                                                      
 (name = RidgeRegressor, package_name = MultivariateStats, ... )        
 (name = RidgeRegressor, package_name = ScikitLearn, ... )              
 (name = RobustRegressor, package_name = MLJLinearModels, ... )         
 (name = SGDRegressor, package_name = ScikitLearn, ... )                
 (name = SVMLRegressor, package_name = ScikitLearn, ... )               
 (name = SVMNuRegressor, package_name = ScikitLearn, ... )              
 (name = SVMRegressor, package_name = ScikitLearn, ... )                
 (name = TheilSenRegressor, package_name = ScikitLearn, ... )           
 (name = XGBoostRegressor, package_name = XGBoost, ... )                
models(matching(X, y))[6]
Decision Tree Regressor.
→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).
→ do `@load DecisionTreeRegressor pkg="DecisionTree"` to use the model.
→ do `?DecisionTreeRegressor` for documentation.
(name = "DecisionTreeRegressor",
 package_name = "DecisionTree",
 is_supervised = true,
 docstring = "Decision Tree Regressor.\n→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).\n→ do `@load DecisionTreeRegressor pkg=\"DecisionTree\"` to use the model.\n→ do `?DecisionTreeRegressor` for documentation.",
 hyperparameter_types = ["Float64", "Int64", "Int64", "Int64", "Float64", "Int64", "Bool"],
 hyperparameters = Symbol[:pruning_purity_threshold, :max_depth, :min_samples_leaf, :min_samples_split, :min_purity_increase, :n_subfeatures, :post_prune],
 implemented_methods = Symbol[:fit, :predict, :fitted_params],
 is_pure_julia = true,
 is_wrapper = false,
 load_path = "MLJModels.DecisionTree_.DecisionTreeRegressor",
 package_license = "MIT",
 package_url = "https://github.com/bensadeghi/DecisionTree.jl",
 package_uuid = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb",
 prediction_type = :deterministic,
 supports_online = false,
 supports_weights = false,
 input_scitype = Table{_s13} where _s13<:Union{AbstractArray{_s12,1} where _s12<:Continuous, AbstractArray{_s12,1} where _s12<:Count, AbstractArray{_s12,1} where _s12<:OrderedFactor},
 target_scitype = AbstractArray{Continuous,1},)

More refined searches:

models() do model
    matching(model, X, y) &&
    model.prediction_type == :deterministic &&
    model.is_pure_julia
end
12-element Array{NamedTuple,1}:
 (name = DecisionTreeRegressor, package_name = DecisionTree, ... )      
 (name = DeterministicConstantRegressor, package_name = MLJModels, ... )
 (name = ElasticNetRegressor, package_name = MLJLinearModels, ... )     
 (name = HuberRegressor, package_name = MLJLinearModels, ... )          
 (name = KNNRegressor, package_name = NearestNeighbors, ... )           
 (name = LADRegressor, package_name = MLJLinearModels, ... )            
 (name = LassoRegressor, package_name = MLJLinearModels, ... )          
 (name = LinearRegressor, package_name = MLJLinearModels, ... )         
 (name = QuantileRegressor, package_name = MLJLinearModels, ... )       
 (name = RidgeRegressor, package_name = MLJLinearModels, ... )          
 (name = RidgeRegressor, package_name = MultivariateStats, ... )        
 (name = RobustRegressor, package_name = MLJLinearModels, ... )         

Searching for an unsupervised model:

models(matching(X))
11-element Array{NamedTuple,1}:
 (name = FeatureSelector, package_name = MLJModels, ... )  
 (name = FillImputer, package_name = MLJModels, ... )      
 (name = ICA, package_name = MultivariateStats, ... )      
 (name = KMeans, package_name = Clustering, ... )          
 (name = KMedoids, package_name = Clustering, ... )        
 (name = KernelPCA, package_name = MultivariateStats, ... )
 (name = OneClassSVM, package_name = LIBSVM, ... )         
 (name = OneHotEncoder, package_name = MLJModels, ... )    
 (name = PCA, package_name = MultivariateStats, ... )      
 (name = Standardizer, package_name = MLJModels, ... )     
 (name = StaticTransformer, package_name = MLJBase, ... )  

Getting the metadata entry for a given model type:

info("PCA")
info("RidgeRegressor", pkg="MultivariateStats") # a model type in multiple packages
Ridge regressor with regularization parameter lambda. Learns a linear regression with a penalty on the l2 norm of the coefficients.
→ based on [MultivariateStats](https://github.com/JuliaStats/MultivariateStats.jl).
→ do `@load RidgeRegressor pkg="MultivariateStats"` to use the model.
→ do `?RidgeRegressor` for documentation.
(name = "RidgeRegressor",
 package_name = "MultivariateStats",
 is_supervised = true,
 docstring = "Ridge regressor with regularization parameter lambda. Learns a linear regression with a penalty on the l2 norm of the coefficients.\n→ based on [MultivariateStats](https://github.com/JuliaStats/MultivariateStats.jl).\n→ do `@load RidgeRegressor pkg=\"MultivariateStats\"` to use the model.\n→ do `?RidgeRegressor` for documentation.",
 hyperparameter_types = ["Real"],
 hyperparameters = Symbol[:lambda],
 implemented_methods = Symbol[:fit, :predict, :fitted_params],
 is_pure_julia = true,
 is_wrapper = false,
 load_path = "MLJModels.MultivariateStats_.RidgeRegressor",
 package_license = "MIT",
 package_url = "https://github.com/JuliaStats/MultivariateStats.jl",
 package_uuid = "6f286f6a-111f-5878-ab1e-185364afe411",
 prediction_type = :deterministic,
 supports_online = false,
 supports_weights = false,
 input_scitype = Table{_s13} where _s13<:(AbstractArray{_s12,1} where _s12<:Continuous),
 target_scitype = AbstractArray{Continuous,1},)
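
Individual metadata entries can be read off the returned named tuple directly; for example:

info("RidgeRegressor", pkg="MultivariateStats").prediction_type    # :deterministic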

Instantiating a model

Reference: Getting Started

@load DecisionTreeClassifier
model = DecisionTreeClassifier(min_samples_split=5, max_depth=4)
MLJModels.DecisionTree_.DecisionTreeClassifier(pruning_purity = 1.0,
                                               max_depth = 4,
                                               min_samples_leaf = 1,
                                               min_samples_split = 5,
                                               min_purity_increase = 0.0,
                                               n_subfeatures = 0,
                                               display_depth = 5,
                                               post_prune = false,
                                               merge_purity_threshold = 0.9,
                                               pdf_smoothing = 0.05,) @ 1…44

or

model = @load DecisionTreeClassifier
model.min_samples_split = 5
model.max_depth = 4

Evaluating a model

Reference: Evaluating Model Performance

X, y = @load_boston
model = @load KNNRegressor
evaluate(model, X, y, resampling=CV(nfolds=5), measure=[rms, mav])
(measure = MLJBase.Measure[rms, mav],
 measurement = [8.819961396135172, 6.07135313531353],
 per_fold = Array{Float64,1}[[8.525465870955774, 8.52461967445231, 10.74455588603451, 9.393386761519249, 6.318598752577854], [6.489306930693069, 5.434059405940592, 7.613069306930692, 6.033663366336635, 4.786666666666665]],
 per_observation = Missing[missing, missing],)

Basic fit/evaluate/predict by hand:

Reference: Getting Started, Machines, Evaluating Model Performance, Performance Measures

using RDatasets
vaso = dataset("robustbase", "vaso"); # a DataFrame
first(vaso, 3)

3 rows × 3 columns

│ Row │ Volume  │ Rate    │ Y     │
│     │ Float64 │ Float64 │ Int64 │
├─────┼─────────┼─────────┼───────┤
│ 1   │ 3.7     │ 0.825   │ 1     │
│ 2   │ 3.5     │ 1.09    │ 1     │
│ 3   │ 1.25    │ 2.5     │ 1     │
y, X = unpack(vaso, ==(:Y), c -> true; :Y => Multiclass)

tree_model = @load DecisionTreeClassifier
┌ Info: A model type "DecisionTreeClassifier" is already loaded.
└ No new code loaded.

Bind the model and data together in a machine, which will additionally store the learned parameters (fitresults) when fit:

tree = machine(tree_model, X, y)
Machine{DecisionTreeClassifier} @ 1…73

Split row indices into training and evaluation rows:

train, test = partition(eachindex(y), 0.7, shuffle=true, rng=1234); # 70:30 split
([27, 28, 30, 31, 32, 18, 21, 9, 26, 14  …  7, 39, 2, 37, 1, 8, 19, 25, 35, 34], [22, 13, 11, 4, 10, 16, 3, 20, 29, 23, 12, 24])

Fit on train and evaluate on test:

fit!(tree, rows=train)
yhat = predict(tree, rows=test);
mean(cross_entropy(yhat, y[test]))
1.135369212298553

Predict on new data:

Xnew = (Volume=3*rand(3), Rate=3*rand(3))
predict(tree, Xnew)      # a vector of distributions
3-element Array{UnivariateFinite{Int64,UInt8,Float64},1}:
 UnivariateFinite(0=>0.0244, 1=>0.976)
 UnivariateFinite(0=>0.0244, 1=>0.976)
 UnivariateFinite(0=>0.9, 1=>0.1)     
predict_mode(tree, Xnew) # a vector of point-predictions
3-element CategoricalArrays.CategoricalArray{Int64,1,UInt8}:
 1
 1
 0
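
Because these predictions are UnivariateFinite distributions, class probabilities can be extracted by broadcasting pdf over them (a minimal sketch, assuming pdf accepts the raw class labels):

pdf.(predict(tree, Xnew), 1)    # probability assigned to class 1 for each observation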

More performance evaluation examples

import LossFunctions.ZeroOneLoss

Evaluating model + data directly:

evaluate(tree_model, X, y,
         resampling=Holdout(fraction_train=0.7, shuffle=true, rng=1234),
         measure=[cross_entropy, ZeroOneLoss()])
(measure = Any[cross_entropy, LossFunctions.ZeroOneLoss()],
 measurement = [1.135369212298553, 0.4166666666666667],
 per_fold = Array{Float64,1}[[1.135369212298553], [0.4166666666666667]],
 per_observation = Array{Array{Float64,1},1}[[[0.10536051565782628, 3.7135720667043075, 0.10536051565782628, 2.3025850929940455, 0.10536051565782628, 0.3184537311185346, 0.02469261259037141, 0.3184537311185346, 0.3184537311185346, 1.2992829841302609, 3.7135720667043075, 1.2992829841302609]], [[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0]]],)

If a machine is already defined, as above:

evaluate!(tree,
          resampling=Holdout(fraction_train=0.7, shuffle=true, rng=1234),
          measure=[cross_entropy, ZeroOneLoss()])
(measure = Any[cross_entropy, LossFunctions.ZeroOneLoss()],
 measurement = [1.135369212298553, 0.4166666666666667],
 per_fold = Array{Float64,1}[[1.135369212298553], [0.4166666666666667]],
 per_observation = Array{Array{Float64,1},1}[[[0.10536051565782628, 3.7135720667043075, 0.10536051565782628, 2.3025850929940455, 0.10536051565782628, 0.3184537311185346, 0.02469261259037141, 0.3184537311185346, 0.3184537311185346, 1.2992829841302609, 3.7135720667043075, 1.2992829841302609]], [[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0]]],)

Using cross-validation:

evaluate!(tree, resampling=CV(nfolds=5, shuffle=true, rng=1234),
          measure=[cross_entropy, ZeroOneLoss()])
(measure = Any[cross_entropy, LossFunctions.ZeroOneLoss()],
 measurement = [0.8952054505541888, 0.37662337662337664],
 per_fold = Array{Float64,1}[[1.3414493126944902, 0.6793778736204937, 0.502160161140067, 0.7171963107684262, 1.2358435945474666], [0.5714285714285714, 0.42857142857142855, 0.0, 0.42857142857142855, 0.45454545454545453]],
 per_observation = Array{Array{Float64,1},1}[[[0.02469261259037141, 0.9444616088408514, 0.02469261259037141, 0.02469261259037141, 3.7135720667043075, 3.7135720667043075, 0.9444616088408514], [1.2321436812926323, 0.02469261259037141, 0.3448404862917295, 1.2321436812926323, 0.3448404862917295, 1.2321436812926323, 0.3448404862917295], [0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.02469261259037141, 0.6931471805599453, 0.02469261259037141, 0.6931471805599453], [0.3629054936893685, 1.1895840668738362, 1.1895840668738362, 0.3629054936893685, 0.3629054936893685, 0.3629054936893685, 1.1895840668738362], [3.7135720667043075, 0.0953101798043249, 2.3978952727983707, 0.0953101798043249, 0.3184537311185346, 0.02469261259037141, 0.3184537311185346, 0.3184537311185346, 1.2992829841302609, 3.7135720667043075, 1.2992829841302609]], [[0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0], [1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0], [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0]]],)

With user-specified train/test pairs of row indices:

f1, f2, f3 = 1:13, 14:26, 27:36
pairs = [(f1, vcat(f2, f3)), (f2, vcat(f3, f1)), (f3, vcat(f1, f2))];
evaluate!(tree,
          resampling=pairs,
          measure=[cross_entropy, ZeroOneLoss()])
(measure = Any[cross_entropy, LossFunctions.ZeroOneLoss()],
 measurement = [0.895254695800462, 0.24136008918617616],
 per_fold = Array{Float64,1}[[0.7538091986662944, 1.1473950551467866, 0.7845598335883047], [0.30434782608695654, 0.30434782608695654, 0.11538461538461539]],
 per_observation = Array{Array{Float64,1},1}[[[0.15415067982725836, 0.15415067982725836, 0.15415067982725836, 0.15415067982725836, 0.15415067982725836, 1.9459101490553135, 0.15415067982725836, 0.02469261259037141, 1.9459101490553135, 1.9459101490553135  …  0.15415067982725836, 1.9459101490553135, 0.15415067982725836, 0.02469261259037141, 3.7135720667043075, 0.02469261259037141, 1.9459101490553135, 0.15415067982725836, 0.15415067982725836, 0.15415067982725836], [0.02469261259037141, 3.7135720667043075, 3.7135720667043075, 0.02469261259037141, 3.7135720667043075, 0.02469261259037141, 3.7135720667043075, 0.02469261259037141, 0.02469261259037141, 0.02469261259037141  …  0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 3.7135720667043075, 0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 0.02469261259037141], [0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.02469261259037141  …  0.02469261259037141, 0.6931471805599453, 3.7135720667043075, 0.02469261259037141, 0.6931471805599453, 0.6931471805599453, 3.7135720667043075, 3.7135720667043075, 0.02469261259037141, 0.6931471805599453]], [[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0  …  0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]]],)

Changing a hyperparameter and re-evaluating:

tree_model.max_depth = 3
evaluate!(tree,
          resampling=CV(nfolds=5, shuffle=true, rng=1234),
          measure=[cross_entropy, ZeroOneLoss()])
(measure = Any[cross_entropy, LossFunctions.ZeroOneLoss()],
 measurement = [0.9428720309210789, 0.37662337662337664],
 per_fold = Array{Float64,1}[[1.2639529176789173, 0.5389359017582559, 0.39569399232593006, 1.0168163229106142, 1.498961019931677], [0.42857142857142855, 0.42857142857142855, 0.14285714285714285, 0.42857142857142855, 0.45454545454545453]],
 per_observation = Array{Array{Float64,1},1}[[[0.02469261259037141, 1.3217558399823195, 0.02469261259037141, 0.02469261259037141, 3.7135720667043075, 3.7135720667043075, 0.02469261259037141], [0.8873031950009028, 0.02469261259037141, 0.5306282510621704, 0.8873031950009028, 0.5306282510621704, 0.8873031950009028, 0.02469261259037141], [0.40546510810816444, 0.40546510810816444, 0.40546510810816444, 0.02469261259037141, 0.40546510810816444, 0.02469261259037141, 1.0986122886681098], [0.02469261259037141, 0.8266785731844679, 3.7135720667043075, 0.5753641449035618, 0.5753641449035618, 0.5753641449035618, 0.8266785731844679], [3.7135720667043075, 0.2876820724517809, 3.7135720667043075, 0.2876820724517809, 0.11778303565638351, 0.02469261259037141, 0.11778303565638351, 0.11778303565638351, 2.1972245773362196, 3.7135720667043075, 2.1972245773362196]], [[0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0], [1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0], [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0]]],)

Inspecting training results

Fit an ordinary least squares model to some synthetic data:

x1 = rand(100)
x2 = rand(100)

X = (x1=x1, x2=x2)
y = x1 - 2x2 + 0.1*rand(100);

ols_model = @load LinearRegressor pkg=GLM
ols =  machine(ols_model, X, y)
fit!(ols)
Machine{LinearRegressor} @ 9…30

Get a named tuple representing the learned parameters, human-readable if appropriate:

fitted_params(ols)
(coef = [0.992013999126179, -1.9746314749363978],
 intercept = 0.04228349567790817,)
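
Entries of this named tuple can be accessed directly, as in:

fitted_params(ols).coef    # the coefficient vector shown above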

Get other training-related information:

report(ols)
(deviance = 0.08516363110779646,
 dof_residual = 97.0,
 stderror = [0.01140189967384003, 0.010799515922194679, 0.008204103143485965],
 vcov = [0.00013000331617231338 6.876975495565453e-6 -6.385237241869661e-5; 6.876975495565453e-6 0.0001166295441537364 -5.9577853863531925e-5; -6.385237241869661e-5 -5.9577853863531925e-5 6.730730838895631e-5],)

Basic fit/transform for unsupervised models

Load data:

X, y = @load_iris
train, test = partition(eachindex(y), 0.97, shuffle=true, rng=123)
([125, 100, 130, 9, 70, 148, 39, 64, 6, 107  …  110, 59, 139, 21, 112, 144, 140, 72, 109, 41], [106, 147, 47, 5])

Instantiate and fit the model/machine:

@load PCA
pca_model = PCA(maxoutdim=2)
pca = machine(pca_model, X)
fit!(pca, rows=train)
Machine{PCA} @ 1…52

Transform selected data bound to the machine:

transform(pca, rows=test);
(x1 = [-3.3942826854483243, -1.5219827578765068, 2.538247455185219, 2.7299639893931373],
 x2 = [0.5472450223745241, -0.36842368617126214, 0.5199299511335698, 0.3448466122232363],)

Transform new data:

Xnew = (sepal_length=rand(3), sepal_width=rand(3),
        petal_length=rand(3), petal_width=rand(3));
transform(pca, Xnew)
(x1 = [4.376819837057599, 4.930541606755344, 5.04777923947442],
 x2 = [-4.738508460340975, -4.488475105573376, -4.80683862498002],)

Inverting learned transformations

y = rand(100);
stand_model = UnivariateStandardizer()
stand = machine(stand_model, y)
fit!(stand)
z = transform(stand, y);
@assert inverse_transform(stand, z) ≈ y # true
[ Info: Training Machine{UnivariateStandardizer} @ 1…10.
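
The same machine transforms and inverse-transforms new data as well; a brief sketch using fresh random values:

znew = transform(stand, rand(5))         # standardize new observations
ynew = inverse_transform(stand, znew)    # map them back to the original scale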

Nested hyperparameter tuning

Reference: Tuning Models

Define a model with nested hyperparameters:

tree_model = @load DecisionTreeClassifier
forest_model = EnsembleModel(atom=tree_model, n=300)
MLJ.ProbabilisticEnsembleModel(atom = MLJModels.DecisionTree_.DecisionTreeClassifier(pruning_purity = 1.0,
                                                                                     max_depth = -1,
                                                                                     min_samples_leaf = 1,
                                                                                     min_samples_split = 2,
                                                                                     min_purity_increase = 0.0,
                                                                                     n_subfeatures = 0,
                                                                                     display_depth = 5,
                                                                                     post_prune = false,
                                                                                     merge_purity_threshold = 0.9,
                                                                                     pdf_smoothing = 0.05,),
                               atomic_weights = Float64[],
                               bagging_fraction = 0.8,
                               rng = MersenneTwister(UInt32[0x026ce58d, 0xdedad331, 0xee6917e9, 0xcb3e2c68]) @ 113,
                               n = 300,
                               acceleration = ComputationalResources.CPU1{Nothing}(nothing),
                               out_of_bag_measure = Any[],) @ 1…50

Inspect all hyperparameters, even nested ones (returns a nested named tuple):

params(forest_model)
(atom = (pruning_purity = 1.0,
         max_depth = -1,
         min_samples_leaf = 1,
         min_samples_split = 2,
         min_purity_increase = 0.0,
         n_subfeatures = 0,
         display_depth = 5,
         post_prune = false,
         merge_purity_threshold = 0.9,
         pdf_smoothing = 0.05,),
 atomic_weights = Float64[],
 bagging_fraction = 0.8,
 rng = MersenneTwister(UInt32[0x026ce58d, 0xdedad331, 0xee6917e9, 0xcb3e2c68]) @ 113,
 n = 300,
 acceleration = ComputationalResources.CPU1{Nothing}(nothing),
 out_of_bag_measure = Any[],)
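
Nested hyperparameters can also be read or set individually using dot syntax; for example:

forest_model.atom.min_samples_split    # 2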

Define ranges for hyperparameters to be tuned:

r1 = range(forest_model, :bagging_fraction, lower=0.5, upper=1.0, scale=:log10)
MLJ.NumericRange(field = :bagging_fraction,
                 lower = 0.5,
                 upper = 1.0,
                 scale = :log10,) @ 4…78
r2 = range(forest_model, :(atom.n_subfeatures), lower=1, upper=4) # nested
MLJ.NumericRange(field = :(atom.n_subfeatures),
                 lower = 1,
                 upper = 4,
                 scale = :linear,) @ 1…68
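
To inspect the grid of values a numeric range generates, it can be passed to iterator together with a resolution (assuming the iterator method MLJ provides for ranges):

iterator(r1, 5)    # five values between 0.5 and 1.0, uniformly spaced on the log scale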

Wrap the model in a tuning strategy:

tuned_forest = TunedModel(model=forest_model,
                          tuning=Grid(resolution=12),
                          resampling=CV(nfolds=6),
                          ranges=[r1, r2],
                          measure=cross_entropy)
MLJ.ProbabilisticTunedModel(model = MLJ.ProbabilisticEnsembleModel(atom = DecisionTreeClassifier @ 9…24,
                                                                   atomic_weights = Float64[],
                                                                   bagging_fraction = 0.8,
                                                                   rng = MersenneTwister(UInt32[0x026ce58d, 0xdedad331, 0xee6917e9, 0xcb3e2c68]) @ 113,
                                                                   n = 300,
                                                                   acceleration = ComputationalResources.CPU1{Nothing}(nothing),
                                                                   out_of_bag_measure = Any[],),
                            tuning = Grid(resolution = 12,
                                          acceleration = ComputationalResources.CPU1{Nothing}(nothing),),
                            resampling = CV(nfolds = 6,
                                            shuffle = false,
                                            rng = MersenneTwister(UInt32[0x026ce58d, 0xdedad331, 0xee6917e9, 0xcb3e2c68]) @ 113,),
                            measure = MLJBase.CrossEntropy(),
                            weights = nothing,
                            operation = StatsBase.predict,
                            ranges = MLJ.NumericRange{T,Symbol} where T[NumericRange @ 4…78, NumericRange @ 1…68],
                            full_report = true,
                            train_best = true,) @ 1…40

Bind the wrapped model to data:

tuned = machine(tuned_forest, X, y)
Machine{ProbabilisticTunedModel} @ 4…48

Fitting the resultant machine optimizes the hyperparameters specified in the ranges, using the specified tuning and resampling strategies and performance measure (possibly a vector of measures), and retrains on all data bound to the machine:

fit!(tuned)
Machine{ProbabilisticTunedModel} @ 4…48

Inspecting the optimal model:

F = fitted_params(tuned)
(best_model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 8…95,
 best_fitted_params = (fitresult = WrappedEnsemble @ 2…27,),)
F.best_model
MLJ.ProbabilisticEnsembleModel(atom = MLJModels.DecisionTree_.DecisionTreeClassifier(pruning_purity = 1.0,
                                                                                     max_depth = -1,
                                                                                     min_samples_leaf = 1,
                                                                                     min_samples_split = 2,
                                                                                     min_purity_increase = 0.0,
                                                                                     n_subfeatures = 3,
                                                                                     display_depth = 5,
                                                                                     post_prune = false,
                                                                                     merge_purity_threshold = 0.9,
                                                                                     pdf_smoothing = 0.05,),
                               atomic_weights = Float64[],
                               bagging_fraction = 0.5,
                               rng = MersenneTwister(UInt32[0x026ce58d, 0xdedad331, 0xee6917e9, 0xcb3e2c68]) @ 661,
                               n = 300,
                               acceleration = ComputationalResources.CPU1{Nothing}(nothing),
                               out_of_bag_measure = Any[],) @ 8…95

Inspecting details of tuning procedure:

report(tuned)
(parameter_names = ["bagging_fraction" "atom.n_subfeatures"],
 parameter_scales = Symbol[:log10 :linear],
 best_measurement = 0.1767244188426702,
 best_report = (measures = Any[],
                oob_measurements = missing,),
 parameter_values = Any[0.5 1; 0.5325205447199813 1; … ; 0.9389309106617063 4; 1.0 4],
 measurements = [0.2430214868035755, 0.23931782231853957, 0.23295319322842825, 0.22745888680421453, 0.22792179726142803, 0.2374858254477894, 0.23081888767038572, 0.227816421282213, 0.22211641977662325, 0.22099906529011024  …  0.18983352007263385, 0.18846513183605337, 0.1960367039510861, 0.20273574810151285, 0.2054643003726404, 0.21927905750370957, 0.22104177933372812, 0.23891126602418652, 0.2545481220282691, 0.3253427761652838],)
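
Entries of the report can also be read off individually; for example:

report(tuned).best_measurement    # 0.1767244188426702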

Visualizing these results:

using Plots
plot(tuned)

Predicting on new data using the optimized model:

predict(tuned, Xnew)
3-element Array{UnivariateFinite{String,UInt8,Float64},1}:
 UnivariateFinite(setosa=>0.968, versicolor=>0.0161, virginica=>0.0161)
 UnivariateFinite(setosa=>0.968, versicolor=>0.0161, virginica=>0.0161)
 UnivariateFinite(setosa=>0.968, versicolor=>0.0161, virginica=>0.0161)

Constructing a linear pipeline

Reference: Composing Models

Constructing a linear (unbranching) pipeline with a learned target transformation/inverse transformation:

X, y = @load_reduced_ames
@load KNNRegressor
pipe = @pipeline MyPipe(X -> coerce(X, :age=>Continuous),
                               hot = OneHotEncoder(),
                               knn = KNNRegressor(K=3),
                               target = UnivariateStandardizer())
Main.ex-workflows.MyPipe(hot = OneHotEncoder(features = Symbol[],
                                             drop_last = false,
                                             ordered_factor = true,),
                         knn = MLJModels.NearestNeighbors_.KNNRegressor(K = 3,
                                                                        algorithm = :kdtree,
                                                                        metric = Distances.Euclidean(0.0),
                                                                        leafsize = 10,
                                                                        reorder = true,
                                                                        weights = :uniform,),
                         target = UnivariateStandardizer(),) @ 1…95

Evaluating the pipeline (just as you would any other model):

pipe.knn.K = 2
pipe.hot.drop_last = true
evaluate(pipe, X, y, resampling=Holdout(), measure=rms, verbosity=2)
(measure = MLJBase.RMS[rms],
 measurement = [53136.24281527115],
 per_fold = Array{Float64,1}[[53136.24281527115]],
 per_observation = Missing[missing],)

Constructing a linear (unbranching) pipeline with a static (unlearned) target transformation/inverse transformation:

@load DecisionTreeRegressor
pipe2 = @pipeline MyPipe2(X -> coerce(X, :age=>Continuous),
                               hot = OneHotEncoder(),
                               tree = DecisionTreeRegressor(max_depth=4),
                               target = y -> log.(y),
                               inverse = z -> exp.(z))
Main.ex-workflows.MyPipe2(hot = OneHotEncoder(features = Symbol[],
                                              drop_last = false,
                                              ordered_factor = true,),
                          tree = MLJModels.DecisionTree_.DecisionTreeRegressor(pruning_purity_threshold = 0.0,
                                                                               max_depth = 4,
                                                                               min_samples_leaf = 5,
                                                                               min_samples_split = 2,
                                                                               min_purity_increase = 0.0,
                                                                               n_subfeatures = 0,
                                                                               post_prune = false,),
                          target = MLJBase.StaticTransformer(f = getfield(Main.ex-workflows, Symbol("##24#25"))(),),
                          inverse = MLJBase.StaticTransformer(f = getfield(Main.ex-workflows, Symbol("##26#27"))(),),) @ 1…93
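
Like any other model, this pipeline can be evaluated directly; a brief sketch reusing the holdout evaluation from above (output omitted):

evaluate(pipe2, X, y, resampling=Holdout(), measure=rms)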

Creating a homogeneous ensemble of models

Reference: Homogeneous Ensembles

X, y = @load_iris
tree_model = @load DecisionTreeClassifier
forest_model = EnsembleModel(atom=tree_model, bagging_fraction=0.8, n=300)
forest = machine(forest_model, X, y)
evaluate!(forest, measure=cross_entropy)
(measure = MLJBase.CrossEntropy[cross_entropy],
 measurement = [0.2233831542518316],
 per_fold = Array{Float64,1}[[0.032789822822996806, 0.032789822822996806, 0.29847620301962974, 0.34056444818426873, 0.31791444474891384, 0.31776418391218386]],
 per_observation = Array{Array{Float64,1},1}[[[0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806  …  0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806], [0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806  …  0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806], [0.06611682693906369, 0.036072984281914154, 0.6652600953517473, 0.036072984281914154, 0.10410682157403994, 0.052652451919248416, 0.07291761391377606, 0.40586841523200523, 0.032789822822996806, 0.032789822822996806  …  0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.3593667356044473, 0.032789822822996806, 3.546782698630493, 0.032789822822996806, 1.5036742522434936, 0.09012472282394686, 0.032789822822996806], [0.032789822822996806, 0.2640915618386687, 4.127134385045096, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 3.347809508244096, 0.032789822822996806  …  0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.036072984281914154, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806, 0.032789822822996806], [0.03278982282299715, 0.06273369919606898, 0.03278982282299715, 0.03278982282299715, 0.03278982282299715, 0.03278982282299715, 4.127134385045096, 0.03278982282299715, 0.049314495155294784, 0.03278982282299715  …  0.03278982282299715, 0.03278982282299715, 0.03278982282299715, 0.0360729842819145, 2.3166622068848786, 0.03278982282299715, 0.2682309019476121, 0.03278982282299715, 0.08665953273572746, 0.03278982282299715], [0.056001587964934534, 0.3730916050247462, 0.8111318494461643, 0.03278982282299715, 2.0377425125120823, 0.03278982282299715, 0.03278982282299715, 0.03278982282299715, 1.1574039140300967, 1.1374201838166589  …  0.03278982282299715, 0.03607298428191439, 0.03278982282299715, 0.03278982282299715, 0.03278982282299715, 0.03607298428191439, 0.05600158796493465, 0.03607298428191439, 0.03278982282299715, 0.08320630873701282]]],)

Performance curves

Generate a plot of performance as a function of some hyperparameter (building on the preceding example):

r = range(forest_model, :n, lower=1, upper=1000, scale=:log10)
curve = MLJ.learning_curve!(forest,
                            range=r,
                            resampling=Holdout(),
                            measure=cross_entropy,
                            n=4,
                            verbosity=0)
(parameter_name = "n",
 parameter_scale = :log10,
 parameter_values = [1, 2, 3, 4, 5, 7, 9, 11, 14, 17  …  117, 149, 189, 240, 304, 386, 489, 621, 788, 1000],
 measurements = [1.5795422129957846 0.4877169964032245 0.5787024311192711 1.1246150394155512; 0.9260687819890557 0.4877169964032245 0.6388462425511587 0.5929686664090277; … ; 0.5566886542022589 0.5693696710993609 0.5507729777771561 0.543633302075341; 0.5548894463467063 0.5623760569783312 0.55498243553424 0.5455251265631544],)
using Plots
plot(curve.parameter_values, curve.measurements, xlab=curve.parameter_name, xscale=curve.parameter_scale)