Common MLJ Workflows

Data ingestion

using RDatasets
channing = dataset("boot", "channing")
first(channing, 4)

4 rows × 5 columns

	Sex	Entry	Exit	Time	Cens
	Categorical…	Int32	Int32	Int32	Int32
1	Male	782	909	127	1
2	Male	1020	1128	108	1
3	Male	856	969	113	1
4	Male	915	957	42	1

Inspecting metadata, including column scientific types:

schema(channing)

┌─────────┬──────────────────────────┬───────────────┐
│ _.names │ _.types                  │ _.scitypes    │
├─────────┼──────────────────────────┼───────────────┤
│ Sex     │ CategoricalString{UInt8} │ Multiclass{2} │
│ Entry   │ Int32                    │ Count         │
│ Exit    │ Int32                    │ Count         │
│ Time    │ Int32                    │ Count         │
│ Cens    │ Int32                    │ Count         │
└─────────┴──────────────────────────┴───────────────┘
_.nrows = 462

Unpacking data and correcting for wrong scitypes:

y, X =  unpack(channing,
               ==(:Exit),            # y is the :Exit column
               !=(:Time);            # X is the rest, except :Time
               :Exit=>Continuous,
               :Entry=>Continuous,
               :Cens=>Multiclass)
first(X, 4)

4 rows × 3 columns

	Sex	Entry	Cens
	Categorical…	Float64	Categorical…
1	Male	782.0	1
2	Male	1020.0	1
3	Male	856.0	1
4	Male	915.0	1

Note: Before Julia 1.2, replace !=(:Time) with col -> col != :Time.

y[1:4]

4-element Array{Float64,1}:
  909.0
 1128.0
  969.0
  957.0

Loading a built-in supervised dataset:

X, y = @load_iris;
selectrows(X, 1:4) # selectrows works for any Tables.jl table

(sepal_length = [5.1, 4.9, 4.7, 4.6],
 sepal_width = [3.5, 3.0, 3.2, 3.1],
 petal_length = [1.4, 1.4, 1.3, 1.5],
 petal_width = [0.2, 0.2, 0.2, 0.2],)

y[1:4]

4-element CategoricalArray{String,1,UInt32}:
 "setosa"
 "setosa"
 "setosa"
 "setosa"

Model search (experimental)

Reference: Model Search

Searching for a supervised model:

X, y = @load_boston
models(matching(X, y))

52-element Array{NamedTuple{(:name, :package_name, :is_supervised, :docstring, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :is_pure_julia, :is_wrapper, :load_path, :package_license, :package_url, :package_uuid, :prediction_type, :supports_online, :supports_weights, :input_scitype, :target_scitype, :output_scitype),T} where T<:Tuple,1}:
 (name = ARDRegressor, package_name = ScikitLearn, ... )                
 (name = AdaBoostRegressor, package_name = ScikitLearn, ... )           
 (name = BaggingRegressor, package_name = ScikitLearn, ... )            
 (name = BayesianRidgeRegressor, package_name = ScikitLearn, ... )      
 (name = ConstantRegressor, package_name = MLJModels, ... )             
 (name = DecisionTreeRegressor, package_name = DecisionTree, ... )      
 (name = DeterministicConstantRegressor, package_name = MLJModels, ... )
 (name = DummyRegressor, package_name = ScikitLearn, ... )              
 (name = ElasticNetCVRegressor, package_name = ScikitLearn, ... )       
 (name = ElasticNetRegressor, package_name = MLJLinearModels, ... )     
 ⋮                                                                      
 (name = RidgeRegressor, package_name = MultivariateStats, ... )        
 (name = RidgeRegressor, package_name = ScikitLearn, ... )              
 (name = RobustRegressor, package_name = MLJLinearModels, ... )         
 (name = SGDRegressor, package_name = ScikitLearn, ... )                
 (name = SVMLRegressor, package_name = ScikitLearn, ... )               
 (name = SVMNuRegressor, package_name = ScikitLearn, ... )              
 (name = SVMRegressor, package_name = ScikitLearn, ... )                
 (name = TheilSenRegressor, package_name = ScikitLearn, ... )           
 (name = XGBoostRegressor, package_name = XGBoost, ... )

models(matching(X, y))[6]

CART decision tree regressor.
→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).
→ do `@load DecisionTreeRegressor pkg="DecisionTree"` to use the model.
→ do `?DecisionTreeRegressor` for documentation.
(name = "DecisionTreeRegressor",
 package_name = "DecisionTree",
 is_supervised = true,
 docstring = "CART decision tree regressor.\n→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).\n→ do `@load DecisionTreeRegressor pkg=\"DecisionTree\"` to use the model.\n→ do `?DecisionTreeRegressor` for documentation.",
 hyperparameter_ranges = (nothing, nothing, nothing, nothing, nothing, nothing, nothing),
 hyperparameter_types = ("Int64", "Int64", "Int64", "Float64", "Int64", "Bool", "Float64"),
 hyperparameters = (:max_depth, :min_samples_leaf, :min_samples_split, :min_purity_increase, :n_subfeatures, :post_prune, :merge_purity_threshold),
 implemented_methods = Symbol[:predict, :clean!, :fit, :fitted_params],
 is_pure_julia = true,
 is_wrapper = false,
 load_path = "MLJModels.DecisionTree_.DecisionTreeRegressor",
 package_license = "MIT",
 package_url = "https://github.com/bensadeghi/DecisionTree.jl",
 package_uuid = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb",
 prediction_type = :deterministic,
 supports_online = false,
 supports_weights = false,
 input_scitype = Table{_s23} where _s23<:Union{AbstractArray{_s25,1} where _s25<:Continuous, AbstractArray{_s25,1} where _s25<:Count, AbstractArray{_s25,1} where _s25<:OrderedFactor},
 target_scitype = AbstractArray{Continuous,1},
 output_scitype = Unknown,)

Instantiating a model

Reference: Getting Started

@load DecisionTreeClassifier
model = DecisionTreeClassifier(min_samples_split=5, max_depth=4)

DecisionTreeClassifier(
    max_depth = 4,
    min_samples_leaf = 1,
    min_samples_split = 5,
    min_purity_increase = 0.0,
    n_subfeatures = 0,
    post_prune = false,
    merge_purity_threshold = 1.0,
    pdf_smoothing = 0.0,
    display_depth = 5) @ 4…05

model = @load DecisionTreeClassifier
model.min_samples_split = 5
model.max_depth = 4

Evaluating a model

Reference: Evaluating Model Performance

X, y = @load_boston
model = @load KNNRegressor
evaluate(model, X, y, resampling=CV(nfolds=5), measure=[rms, mav])

┌───────────┬───────────────┬────────────────────────────────┐
│ _.measure │ _.measurement │ _.per_fold                     │
├───────────┼───────────────┼────────────────────────────────┤
│ rms       │ 8.82          │ [8.53, 8.52, 10.7, 9.39, 6.32] │
│ mae       │ 6.07          │ [6.49, 5.43, 7.61, 6.03, 4.79] │
└───────────┴───────────────┴────────────────────────────────┘
_.per_observation = [missing, missing]

Basic fit/evaluate/predict by hand:

Reference: Getting Started, Machines, Evaluating Model Performance, Performance Measures

using RDatasets
vaso = dataset("robustbase", "vaso"); # a DataFrame
first(vaso, 3)

3 rows × 3 columns

	Volume	Rate	Y
	Float64	Float64	Int64
1	3.7	0.825	1
2	3.5	1.09	1
3	1.25	2.5	1

y, X = unpack(vaso, ==(:Y), c -> true; :Y => Multiclass)

tree_model = @load DecisionTreeClassifier

┌ Info: A model type "DecisionTreeClassifier" is already loaded.
└ No new code loaded.

Bind the model and data together in a machine, which will additionally store the learned parameters (fitresults) when fit:

tree = machine(tree_model, X, y)

Machine{DecisionTreeClassifier} @ 5…31

Split row indices into training and evaluation rows:

train, test = partition(eachindex(y), 0.7, shuffle=true, rng=1234); # 70:30 split

([27, 28, 30, 31, 32, 18, 21, 9, 26, 14  …  7, 39, 2, 37, 1, 8, 19, 25, 35, 34], [22, 13, 11, 4, 10, 16, 3, 20, 29, 23, 12, 24])

Fit on train and evaluate on test:

fit!(tree, rows=train)
yhat = predict(tree, rows=test);
mean(cross_entropy(yhat, y[test]))

6.5216583816514975

Predict on new data:

Xnew = (Volume=3*rand(3), Rate=3*rand(3))
predict(tree, Xnew)      # a vector of distributions

3-element Array{UnivariateFinite{Int64,UInt32,Float64},1}:
 UnivariateFinite(0=>0.0, 1=>1.0)
 UnivariateFinite(0=>0.9, 1=>0.1)
 UnivariateFinite(0=>0.0, 1=>1.0)

predict_mode(tree, Xnew) # a vector of point-predictions

3-element CategoricalArray{Int64,1,UInt32}:
 1
 0
 1

More performance evaluation examples

import LossFunctions.ZeroOneLoss

Evaluating model + data directly:

evaluate(tree_model, X, y,
         resampling=Holdout(fraction_train=0.7, shuffle=true, rng=1234),
         measure=[cross_entropy, ZeroOneLoss()])

┌───────────────┬───────────────┬────────────┐
│ _.measure     │ _.measurement │ _.per_fold │
├───────────────┼───────────────┼────────────┤
│ cross_entropy │ 6.52          │ [6.52]     │
│ ZeroOneLoss   │ 0.417         │ [0.417]    │
└───────────────┴───────────────┴────────────┘
_.per_observation = [[[0.105, 36.0, ..., 1.3]], [[0.0, 1.0, ..., 1.0]]]

If a machine is already defined, as above:

evaluate!(tree,
          resampling=Holdout(fraction_train=0.7, shuffle=true, rng=1234),
          measure=[cross_entropy, ZeroOneLoss()])

┌───────────────┬───────────────┬────────────┐
│ _.measure     │ _.measurement │ _.per_fold │
├───────────────┼───────────────┼────────────┤
│ cross_entropy │ 6.52          │ [6.52]     │
│ ZeroOneLoss   │ 0.417         │ [0.417]    │
└───────────────┴───────────────┴────────────┘
_.per_observation = [[[0.105, 36.0, ..., 1.3]], [[0.0, 1.0, ..., 1.0]]]

Using cross-validation:

evaluate!(tree, resampling=CV(nfolds=5, shuffle=true, rng=1234),
          measure=[cross_entropy, ZeroOneLoss()])

┌───────────────┬───────────────┬───────────────────────────────────┐
│ _.measure     │ _.measurement │ _.per_fold                        │
├───────────────┼───────────────┼───────────────────────────────────┤
│ cross_entropy │ 3.91          │ [10.6, 0.676, 0.495, 0.717, 7.11] │
│ ZeroOneLoss   │ 0.377         │ [0.571, 0.429, 0.0, 0.429, 0.455] │
└───────────────┴───────────────┴───────────────────────────────────┘
_.per_observation = [[[2.22e-16, 0.944, ..., 0.944], [1.23, 2.22e-16, ..., 0.345], [0.693, 0.693, ..., 0.693], [0.363, 1.19, ..., 1.19], [36.0, 0.0953, ..., 1.3]], [[0.0, 1.0, ..., 1.0], [1.0, 0.0, ..., 0.0], [0.0, 0.0, ..., 0.0], [0.0, 1.0, ..., 1.0], [1.0, 0.0, ..., 1.0]]]

With user-specified train/test pairs of row indices:

f1, f2, f3 = 1:13, 14:26, 27:36
pairs = [(f1, vcat(f2, f3)), (f2, vcat(f3, f1)), (f3, vcat(f1, f2))];
evaluate!(tree,
          resampling=pairs,
          measure=[cross_entropy, ZeroOneLoss()])

┌───────────────┬───────────────┬───────────────────────┐
│ _.measure     │ _.measurement │ _.per_fold            │
├───────────────┼───────────────┼───────────────────────┤
│ cross_entropy │ 5.88          │ [2.16, 11.0, 4.51]    │
│ ZeroOneLoss   │ 0.241         │ [0.304, 0.304, 0.115] │
└───────────────┴───────────────┴───────────────────────┘
_.per_observation = [[[0.154, 0.154, ..., 0.154], [2.22e-16, 36.0, ..., 2.22e-16], [2.22e-16, 2.22e-16, ..., 0.693]], [[0.0, 0.0, ..., 0.0], [0.0, 1.0, ..., 0.0], [0.0, 0.0, ..., 0.0]]]

Changing a hyperparameter and re-evaluating:

tree_model.max_depth = 3
evaluate!(tree,
          resampling=CV(nfolds=5, shuffle=true, rng=1234),
          measure=[cross_entropy, ZeroOneLoss()])

┌───────────────┬───────────────┬─────────────────────────────────────┐
│ _.measure     │ _.measurement │ _.per_fold                          │
├───────────────┼───────────────┼─────────────────────────────────────┤
│ cross_entropy │ 5.47          │ [10.5, 0.532, 0.389, 5.63, 10.3]    │
│ ZeroOneLoss   │ 0.377         │ [0.429, 0.429, 0.143, 0.429, 0.455] │
└───────────────┴───────────────┴─────────────────────────────────────┘
_.per_observation = [[[2.22e-16, 1.32, ..., 2.22e-16], [0.887, 2.22e-16, ..., 2.22e-16], [0.405, 0.405, ..., 1.1], [2.22e-16, 0.827, ..., 0.827], [36.0, 0.288, ..., 2.2]], [[0.0, 1.0, ..., 0.0], [1.0, 0.0, ..., 0.0], [0.0, 0.0, ..., 1.0], [0.0, 1.0, ..., 1.0], [1.0, 0.0, ..., 1.0]]]

Inspecting training results

Fit an ordinary least squares model to some synthetic data:

x1 = rand(100)
x2 = rand(100)

X = (x1=x1, x2=x2)
y = x1 - 2x2 + 0.1*rand(100);

ols_model = @load LinearRegressor pkg=GLM
ols =  machine(ols_model, X, y)
fit!(ols)

Machine{LinearRegressor} @ 1…73

Get a named tuple representing the learned parameters, human-readable if appropriate:

fitted_params(ols)

(coef = [0.9951231615869639, -1.9982604285152694],
 intercept = 0.05120020356997971,)

Get other training-related information:

report(ols)

(deviance = 0.093643092195193,
 dof_residual = 97.0,
 stderror = [0.01186834965189999, 0.011300420378116304, 0.007956683459766484],
 vcov = [0.00014085772345975465 5.046026380802417e-7 -6.536840735736132e-5; 5.046026380802417e-7 0.00012769950072214625 -5.480335910558171e-5; -6.536840735736132e-5 -5.480335910558171e-5 6.330881167892154e-5],)

Basic fit/transform for unsupervised models

Load data:

X, y = @load_iris
train, test = partition(eachindex(y), 0.97, shuffle=true, rng=123)

([125, 100, 130, 9, 70, 148, 39, 64, 6, 107  …  110, 59, 139, 21, 112, 144, 140, 72, 109, 41], [106, 147, 47, 5])

Instantiate and fit the model/machine:

@load PCA
pca_model = PCA(maxoutdim=2)
pca = machine(pca_model, X)
fit!(pca, rows=train)

Machine{PCA} @ 1…58

Transform selected data bound to the machine:

transform(pca, rows=test);

(x1 = [-3.3942826854483243, -1.5219827578765068, 2.538247455185219, 2.7299639893931373],
 x2 = [0.5472450223745241, -0.36842368617126214, 0.5199299511335698, 0.3448466122232363],)

Transform new data:

Xnew = (sepal_length=rand(3), sepal_width=rand(3),
        petal_length=rand(3), petal_width=rand(3));
transform(pca, Xnew)

(x1 = [5.073591843762576, 4.880614680411594, 5.012048734163007],
 x2 = [-4.45492341206699, -4.759206557916316, -4.286369274636478],)

Inverting learned transformations

y = rand(100);
stand_model = UnivariateStandardizer()
stand = machine(stand_model, y)
fit!(stand)
z = transform(stand, y);
@assert inverse_transform(stand, z) ≈ y # true

[ Info: Training Machine{UnivariateStandardizer} @ 7…91.

Nested hyperparameter tuning

Reference: Tuning Models

Define a model with nested hyperparameters:

tree_model = @load DecisionTreeClassifier
forest_model = EnsembleModel(atom=tree_model, n=300)

ProbabilisticEnsembleModel(
    atom = DecisionTreeClassifier(
            max_depth = -1,
            min_samples_leaf = 1,
            min_samples_split = 2,
            min_purity_increase = 0.0,
            n_subfeatures = 0,
            post_prune = false,
            merge_purity_threshold = 1.0,
            pdf_smoothing = 0.0,
            display_depth = 5),
    atomic_weights = Float64[],
    bagging_fraction = 0.8,
    rng = MersenneTwister(UInt32[0xcb2d5333, 0x992f6123, 0x54f474c8, 0x9a010a5b]) @ 976,
    n = 300,
    acceleration = CPU1{Nothing}(nothing),
    out_of_bag_measure = Any[]) @ 9…42

Inspect all hyperparameters, even nested ones (returns nested named tuple):

params(forest_model)

(atom = (max_depth = -1,
         min_samples_leaf = 1,
         min_samples_split = 2,
         min_purity_increase = 0.0,
         n_subfeatures = 0,
         post_prune = false,
         merge_purity_threshold = 1.0,
         pdf_smoothing = 0.0,
         display_depth = 5,),
 atomic_weights = Float64[],
 bagging_fraction = 0.8,
 rng = MersenneTwister(UInt32[0xcb2d5333, 0x992f6123, 0x54f474c8, 0x9a010a5b]) @ 976,
 n = 300,
 acceleration = CPU1{Nothing}(nothing),
 out_of_bag_measure = Any[],)

Define ranges for hyperparameters to be tuned:

r1 = range(forest_model, :bagging_fraction, lower=0.5, upper=1.0, scale=:log10)

MLJBase.NumericRange(Float64, :bagging_fraction, ... )

r2 = range(forest_model, :(atom.n_subfeatures), lower=1, upper=4) # nested

MLJBase.NumericRange(Int64, :(atom.n_subfeatures), ... )

Wrap the model in a tuning strategy:

tuned_forest = TunedModel(model=forest_model,
                          tuning=Grid(resolution=12),
                          resampling=CV(nfolds=6),
                          ranges=[r1, r2],
                          measure=cross_entropy)

ProbabilisticTunedModel(
    model = ProbabilisticEnsembleModel(
            atom = DecisionTreeClassifier @ 9…68,
            atomic_weights = Float64[],
            bagging_fraction = 0.8,
            rng = MersenneTwister(UInt32[0xcb2d5333, 0x992f6123, 0x54f474c8, 0x9a010a5b]) @ 976,
            n = 300,
            acceleration = CPU1{Nothing}(nothing),
            out_of_bag_measure = Any[]),
    tuning = Grid(
            goal = nothing,
            resolution = 12,
            shuffle = true,
            rng = MersenneTwister(UInt32[0xcb2d5333, 0x992f6123, 0x54f474c8, 0x9a010a5b]) @ 976),
    resampling = CV(
            nfolds = 6,
            shuffle = false,
            rng = MersenneTwister(UInt32[0xcb2d5333, 0x992f6123, 0x54f474c8, 0x9a010a5b]) @ 976),
    measure = cross_entropy(
            eps = 2.220446049250313e-16),
    weights = nothing,
    operation = MLJModelInterface.predict,
    range = MLJBase.NumericRange{T,MLJBase.Bounded,Symbol} where T[NumericRange{Float64,…} @ 1…98, NumericRange{Int64,…} @ 1…59],
    train_best = true,
    repeats = 1,
    n = nothing,
    acceleration = CPU1{Nothing}(nothing),
    acceleration_resampling = CPU1{Nothing}(nothing),
    check_measure = true) @ 1…27

Bind the wrapped model to data:

tuned = machine(tuned_forest, X, y)

Machine{ProbabilisticTunedModel{Grid,…}} @ 1…46

Fitting the resultant machine optimizes the hyperparameters specified in range, using the specified tuning and resampling strategies and performance measure (possibly a vector of measures), and retrains on all data bound to the machine:

fit!(tuned)

Machine{ProbabilisticTunedModel{Grid,…}} @ 1…46

Inspecting the optimal model:

F = fitted_params(tuned)

(best_model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 1…98,
 best_fitted_params = (fitresult = WrappedEnsemble{Tuple{Node{Float64,…},…},…} @ 8…18,),)

F.best_model

ProbabilisticEnsembleModel(
    atom = DecisionTreeClassifier(
            max_depth = -1,
            min_samples_leaf = 1,
            min_samples_split = 2,
            min_purity_increase = 0.0,
            n_subfeatures = 3,
            post_prune = false,
            merge_purity_threshold = 1.0,
            pdf_smoothing = 0.0,
            display_depth = 5),
    atomic_weights = Float64[],
    bagging_fraction = 0.5,
    rng = MersenneTwister(UInt32[0xcb2d5333, 0x992f6123, 0x54f474c8, 0x9a010a5b]) @ 465,
    n = 300,
    acceleration = CPU1{Nothing}(nothing),
    out_of_bag_measure = Any[]) @ 1…98

Inspecting details of tuning procedure:

report(tuned)

(best_model = ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 1…98,
 best_result = (measure = MLJBase.CrossEntropy{Float64}[cross_entropy],
                measurement = [0.14623156150288902],),
 best_report = (measures = Any[],
                oob_measurements = missing,),
 history = Tuple{MLJ.ProbabilisticEnsembleModel{MLJModels.DecisionTree_.DecisionTreeClassifier},NamedTuple{(:measure, :measurement),Tuple{Array{MLJBase.CrossEntropy{Float64},1},Array{Float64,1}}}}[(ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 6…67, (measure = [cross_entropy], measurement = [0.16927051227777315])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 1…29, (measure = [cross_entropy], measurement = [0.1613564824906855])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 5…34, (measure = [cross_entropy], measurement = [0.1835061400032578])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 1…94, (measure = [cross_entropy], measurement = [0.19889579958544434])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 5…54, (measure = [cross_entropy], measurement = [0.649690071500116])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 9…18, (measure = [cross_entropy], measurement = [0.15307565263784192])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 4…11, (measure = [cross_entropy], measurement = [1.0805845402784882])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 1…74, (measure = [cross_entropy], measurement = [0.17083354828403227])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 8…38, (measure = [cross_entropy], measurement = [0.20495486267335197])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 2…42, (measure = [cross_entropy], measurement = [0.21303707205745295]))  …  (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 1…98, (measure = [cross_entropy], measurement = [0.14623156150288902])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 1…56, (measure = [cross_entropy], measurement = [0.42787644661140295])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 1…44, (measure = [cross_entropy], measurement = [0.16181178533487328])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 6…40, (measure = [cross_entropy], measurement = [1.1177926113560381])), 
(ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 3…79, (measure = [cross_entropy], measurement = [0.19794996382846355])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 1…13, (measure = [cross_entropy], measurement = [0.18285085143151514])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 1…94, (measure = [cross_entropy], measurement = [0.8625213051024527])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 2…72, (measure = [cross_entropy], measurement = [0.16554114284039653])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 7…84, (measure = [cross_entropy], measurement = [0.2075469082653952])), (ProbabilisticEnsembleModel{DecisionTreeClassifier} @ 2…95, (measure = [cross_entropy], measurement = [0.20333021669457318]))],
 plotting = (parameter_names = ["bagging_fraction", "atom.n_subfeatures"],
             parameter_scales = Symbol[:log10, :linear],
             parameter_values = Any[0.5671562610977313 4; 0.5 2; … ; 0.7297400528407231 4; 0.7297400528407231 1],
             measurements = [0.16927051227777315, 0.1613564824906855, 0.1835061400032578, 0.19889579958544434, 0.649690071500116, 0.15307565263784192, 1.0805845402784882, 0.17083354828403227, 0.20495486267335197, 0.21303707205745295  …  0.14623156150288902, 0.42787644661140295, 0.16181178533487328, 1.1177926113560381, 0.19794996382846355, 0.18285085143151514, 0.8625213051024527, 0.16554114284039653, 0.2075469082653952, 0.20333021669457318],),)

Visualizing these results:

using Plots
plot(tuned)

Predicting on new data using the optimized model:

predict(tuned, Xnew)

3-element Array{UnivariateFinite{String,UInt32,Float64},1}:
 UnivariateFinite(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
 UnivariateFinite(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
 UnivariateFinite(setosa=>1.0, versicolor=>0.0, virginica=>0.0)

Constructing a linear pipeline

Reference: Composing Models

Constructing a linear (unbranching) pipeline with a learned target transformation/inverse transformation:

X, y = @load_reduced_ames
@load KNNRegressor
pipe = @pipeline MyPipe(X -> coerce(X, :age=>Continuous),
                               hot = OneHotEncoder(),
                               knn = KNNRegressor(K=3),
                               target = UnivariateStandardizer())

MyPipe(
    hot = OneHotEncoder(
            features = Symbol[],
            drop_last = false,
            ordered_factor = true),
    knn = KNNRegressor(
            K = 3,
            algorithm = :kdtree,
            metric = Distances.Euclidean(0.0),
            leafsize = 10,
            reorder = true,
            weights = :uniform),
    target = UnivariateStandardizer()) @ 1…91

Evaluating the pipeline (just as you would any other model):

pipe.knn.K = 2
pipe.hot.drop_last = true
evaluate(pipe, X, y, resampling=Holdout(), measure=rms, verbosity=2)

┌───────────┬───────────────┬────────────┐
│ _.measure │ _.measurement │ _.per_fold │
├───────────┼───────────────┼────────────┤
│ rms       │ 53100.0       │ [53100.0]  │
└───────────┴───────────────┴────────────┘
_.per_observation = [missing]

Constructing a linear (unbranching) pipeline with a static (unlearned) target transformation/inverse transformation:

@load DecisionTreeRegressor
pipe2 = @pipeline MyPipe2(X -> coerce(X, :age=>Continuous),
                               hot = OneHotEncoder(),
                               tree = DecisionTreeRegressor(max_depth=4),
                               target = y -> log.(y),
                               inverse = z -> exp.(z))

MyPipe2(
    hot = OneHotEncoder(
            features = Symbol[],
            drop_last = false,
            ordered_factor = true),
    tree = DecisionTreeRegressor(
            max_depth = 4,
            min_samples_leaf = 5,
            min_samples_split = 2,
            min_purity_increase = 0.0,
            n_subfeatures = 0,
            post_prune = false,
            merge_purity_threshold = 1.0),
    target = StaticTransformer(
            f = getfield(Main.ex-workflows, Symbol("##24#25"))()),
    inverse = StaticTransformer(
            f = getfield(Main.ex-workflows, Symbol("##26#27"))())) @ 1…62

Creating a homogeneous ensemble of models

Reference: Homogeneous Ensembles

X, y = @load_iris
tree_model = @load DecisionTreeClassifier
forest_model = EnsembleModel(atom=tree_model, bagging_fraction=0.8, n=300)
forest = machine(forest_model, X, y)
evaluate!(forest, measure=cross_entropy)

┌───────────────┬───────────────┬─────────────────────────────────────────────────┐
│ _.measure     │ _.measurement │ _.per_fold                                      │
├───────────────┼───────────────┼─────────────────────────────────────────────────┤
│ cross_entropy │ 0.419         │ [3.66e-15, 3.66e-15, 0.269, 1.61, 0.344, 0.289] │
└───────────────┴───────────────┴─────────────────────────────────────────────────┘
_.per_observation = [[[3.66e-15, 3.66e-15, ..., 3.66e-15], [3.66e-15, 3.66e-15, ..., 3.66e-15], [0.0339, 3.66e-15, ..., 3.66e-15], [3.66e-15, 0.163, ..., 3.66e-15], [3.66e-15, 0.0339, ..., 3.66e-15], [0.0236, 0.366, ..., 0.0305]]]

Performance curves

Generate a plot of performance as a function of some hyperparameter (building on the preceding example).

Single performance curve:

r = range(forest_model, :n, lower=1, upper=1000, scale=:log10)
curve = learning_curve(forest,
                            range=r,
                            resampling=Holdout(),
                            resolution=50,
                            measure=cross_entropy,
                            verbosity=0)

(parameter_name = "n",
 parameter_scale = :log10,
 parameter_values = [1, 2, 3, 4, 5, 6, 7, 8, 10, 11  …  281, 324, 373, 429, 494, 569, 655, 754, 869, 1000],
 measurements = [16.820371581588002, 16.820371581588002, 16.820371581588002, 4.542505582903701, 4.410104400487424, 4.359125602188101, 4.335626768339178, 4.324536217435546, 2.843674538223767, 2.850534048730983  …  1.2368416614697912, 1.2376020550200864, 1.2411683060318237, 1.243532413157902, 1.246578117905889, 1.2450011906248826, 1.2443232118467873, 1.2405680841310842, 1.23505800404394, 1.2353673200913653],)

using Plots
plot(curve.parameter_values, curve.measurements, xlab=curve.parameter_name, xscale=curve.parameter_scale)

Multiple curves:

curve = learning_curve(forest,
                       range=r,
                       resampling=Holdout(),
                       measure=cross_entropy,
                       resolution=50,
                       rng_name=:rng,
                       rngs=4,
                       verbosity=0)

(parameter_name = "n",
 parameter_scale = :log10,
 parameter_values = [1, 2, 3, 4, 5, 6, 7, 8, 10, 11  …  281, 324, 373, 429, 494, 569, 655, 754, 869, 1000],
 measurements = [8.009700753137146 4.004850376568572 15.218431430960575 4.004850376568572; 8.009700753137146 4.004850376568572 15.218431430960575 4.004850376568572; … ; 1.1769495696022239 1.208025772044022 1.2510785308967738 1.247939404419113; 1.1819626207229375 1.2120729054037578 1.2514821809010175 1.2486239380533115],)

plot(curve.parameter_values, curve.measurements,
xlab=curve.parameter_name, xscale=curve.parameter_scale)