Effect of `ratios` Hyperparameter

using Random
using CSV
using DataFrames
using MLJ
using ScientificTypes
using Imbalance
using Plots, Measures

Loading Data

Let's load the Iris dataset. The objective of this dataset is to predict the type of flower as one of "virginica", "versicolor" and "setosa" using its sepal and petal length and width.

We don't need to do so from a CSV file this time because MLJ has a macro for loading it already! The only difference is that we will need to explicitly convert it to a dataframe, as MLJ loads it as a named tuple of vectors.

# Fetch the Iris features and target through MLJ's loading macro
X, y = @load_iris
# The features arrive as a named tuple of vectors; wrap them in a DataFrame
X = DataFrame(X)
# Preview the first five rows together with their types and scientific types
pretty(first(X, 5))

┌──────────────┬─────────────┬──────────────┬─────────────┐
│ sepal_length │ sepal_width │ petal_length │ petal_width │
│ Float64      │ Float64     │ Float64      │ Float64     │
│ Continuous   │ Continuous  │ Continuous   │ Continuous  │
├──────────────┼─────────────┼──────────────┼─────────────┤
│ 5.1          │ 3.5         │ 1.4          │ 0.2         │
│ 4.9          │ 3.0         │ 1.4          │ 0.2         │
│ 4.7          │ 3.2         │ 1.3          │ 0.2         │
│ 4.6          │ 3.1         │ 1.5          │ 0.2         │
│ 5.0          │ 3.6         │ 1.4          │ 0.2         │
└──────────────┴─────────────┴──────────────┴─────────────┘

Our purpose for this tutorial is primarily visualization. Thus, let's select only two of the continuous features to work with. It's known that the petal length and width play a much bigger role in classifying the type of flower, so let's keep only those.

# Keep only the two petal features, in the order (width, length) used by the rest of the tutorial
X = X[:, [:petal_width, :petal_length]]
# Preview the reduced table
pretty(first(X, 5))

┌─────────────┬──────────────┐
│ petal_width │ petal_length │
│ Float64     │ Float64      │
│ Continuous  │ Continuous   │
├─────────────┼──────────────┤
│ 0.2         │ 1.4          │
│ 0.2         │ 1.4          │
│ 0.2         │ 1.3          │
│ 0.2         │ 1.5          │
│ 0.2         │ 1.4          │
└─────────────┴──────────────┘

Coercing Data

# Inspect each column's scientific type to decide whether any coercion is needed
ScientificTypes.schema(X)

┌──────────────┬────────────┬─────────┐
│ names        │ scitypes   │ types   │
├──────────────┼────────────┼─────────┤
│ petal_width  │ Continuous │ Float64 │
│ petal_length │ Continuous │ Float64 │
└──────────────┴────────────┴─────────┘

Things look good, no coercion is needed.

Oversampling

Iris, by default, has no imbalance problem:

# Show the class counts of the full target (checkbalance comes from Imbalance)
checkbalance(y)

virginica:  ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 50 (100.0%) 
setosa:     ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 50 (100.0%) 
versicolor: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 50 (100.0%)

To simulate an imbalance problem, we will consider a random sample of 100 observations. A random sample does not guarantee preserving the proportion of classes; in this case, we deliberately set the seed to get a very unlikely random sample that suffers from moderate imbalance.

# Fixed seed: it was picked because the resulting sample is noticeably imbalanced
Random.seed!(803429)
# Draw 100 row indices uniformly from the rows of X (indices may repeat)
subset_indices = rand(1:size(X, 1), 100)
# Restrict both the features and the target to the sampled rows
X = X[subset_indices, :]
y = y[subset_indices]
checkbalance(y)         # comes from Imbalance

versicolor: ▇▇▇▇▇▇▇▇▇▇▇ 12 (22.6%) 
setosa:     ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 35 (66.0%) 
virginica:  ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 53 (100.0%)

We will treat this as our training set going forward so we don't need to partition. Now let's oversample it with SMOTE.

# Oversample only "versicolor"; with ratio 0.7 it ends up at roughly 70% of the
# largest class, as the balance report below shows.
Xover, yover = smote(
    X, y;
    k = 5,
    ratios = Dict("versicolor" => 0.7),
    rng = 42,
)
checkbalance(yover)

setosa:     ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 35 (66.0%) 
versicolor: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 37 (69.8%) 
virginica:  ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 53 (100.0%)

Training the Model

# List the models in MLJ's registry that are compatible with (Xover, yover)
models(matching(Xover, yover))

53-element Vector{NamedTuple{(:name, :package_name, :is_supervised, :abstract_type, :deep_properties, :docstring, :fit_data_scitype, :human_name, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :inverse_transform_scitype, :is_pure_julia, :is_wrapper, :iteration_parameter, :load_path, :package_license, :package_url, :package_uuid, :predict_scitype, :prediction_type, :reporting_operations, :reports_feature_importances, :supports_class_weights, :supports_online, :supports_training_losses, :supports_weights, :transform_scitype, :input_scitype, :target_scitype, :output_scitype)}}:
 (name = AdaBoostClassifier, package_name = MLJScikitLearnInterface, ... )
 (name = AdaBoostStumpClassifier, package_name = DecisionTree, ... )
 (name = BaggingClassifier, package_name = MLJScikitLearnInterface, ... )
 (name = BayesianLDA, package_name = MLJScikitLearnInterface, ... )
 (name = BayesianLDA, package_name = MultivariateStats, ... )
 (name = BayesianQDA, package_name = MLJScikitLearnInterface, ... )
 (name = BayesianSubspaceLDA, package_name = MultivariateStats, ... )
 (name = CatBoostClassifier, package_name = CatBoost, ... )
 (name = ConstantClassifier, package_name = MLJModels, ... )
 (name = DecisionTreeClassifier, package_name = BetaML, ... )
 ⋮
 (name = SGDClassifier, package_name = MLJScikitLearnInterface, ... )
 (name = SVC, package_name = LIBSVM, ... )
 (name = SVMClassifier, package_name = MLJScikitLearnInterface, ... )
 (name = SVMLinearClassifier, package_name = MLJScikitLearnInterface, ... )
 (name = SVMNuClassifier, package_name = MLJScikitLearnInterface, ... )
 (name = StableForestClassifier, package_name = SIRUS, ... )
 (name = StableRulesClassifier, package_name = SIRUS, ... )
 (name = SubspaceLDA, package_name = MultivariateStats, ... )
 (name = XGBoostClassifier, package_name = XGBoost, ... )

Let's go for an SVM

# Install the MLJ interface package for LIBSVM; `@load SVC pkg = LIBSVM` below imports it
import Pkg;
Pkg.add("MLJLIBSVMInterface");

Before Oversampling

# 1. Load the SVC model type provided by LIBSVM through MLJ
SVC = @load SVC pkg = LIBSVM

# 2. Instantiate it (γ=0.01 is intentional; presumably chosen so that the
#    model is visibly affected by the class imbalance)
model = SVC(gamma=0.01)

# 3. Wrap the model with the original (imbalanced) data in a machine
mach = machine(model, X, y)

# 4. Fit the machine, i.e. train the model on X, y
fit!(mach)

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /Users/essam/.julia/packages/MLJModels/EkXIe/src/loading.jl:159


import MLJLIBSVMInterface ✔


┌ Info: Training machine(SVC(kernel = RadialBasis, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492



trained Machine; caches model-specific representations of data
  model: SVC(kernel = RadialBasis, …)
  args: 
    1:	Source @527 ⏎ Table{AbstractVector{Continuous}}
    2:	Source @580 ⏎ AbstractVector{Multiclass{3}}

After Oversampling

# Wrap the same model, this time with the oversampled data, in a second machine
mach_over = machine(model, Xover, yover)

# Fit the second machine on the oversampled data
fit!(mach_over)

┌ Info: Training machine(SVC(kernel = RadialBasis, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492



trained Machine; caches model-specific representations of data
  model: SVC(kernel = RadialBasis, …)
  args: 
    1:	Source @277 ⏎ Table{AbstractVector{Continuous}}
    2:	Source @977 ⏎ AbstractVector{Multiclass{3}}

Plot Decision Boundaries

Construct ranges for each feature and subsequently a grid

# Span each feature from one unit below its minimum to one unit above its maximum
pw_lo, pw_hi = extrema(X.petal_width)
pl_lo, pl_hi = extrema(X.petal_length)
petal_width_range = range(pw_lo - 1, pw_hi + 1, length = 200)
petal_length_range = range(pl_lo - 1, pl_hi + 1, length = 200)

# 200×200 matrix of (petal width, petal length) pairs:
# rows vary with petal width, columns with petal length
grid_points = collect(Iterators.product(petal_width_range, petal_length_range))

200×200 Matrix{Tuple{Float64, Float64}}:
 (-0.9, 0.2)       (-0.9, 0.238693)       …  (-0.9, 7.9)
 (-0.878894, 0.2)  (-0.878894, 0.238693)     (-0.878894, 7.9)
 (-0.857789, 0.2)  (-0.857789, 0.238693)     (-0.857789, 7.9)
 (-0.836683, 0.2)  (-0.836683, 0.238693)     (-0.836683, 7.9)
 (-0.815578, 0.2)  (-0.815578, 0.238693)     (-0.815578, 7.9)
 (-0.794472, 0.2)  (-0.794472, 0.238693)  …  (-0.794472, 7.9)
 (-0.773367, 0.2)  (-0.773367, 0.238693)     (-0.773367, 7.9)
 (-0.752261, 0.2)  (-0.752261, 0.238693)     (-0.752261, 7.9)
 (-0.731156, 0.2)  (-0.731156, 0.238693)     (-0.731156, 7.9)
 (-0.71005, 0.2)   (-0.71005, 0.238693)      (-0.71005, 7.9)
 ⋮                                        ⋱  
 (3.13116, 0.2)    (3.13116, 0.238693)       (3.13116, 7.9)
 (3.15226, 0.2)    (3.15226, 0.238693)       (3.15226, 7.9)
 (3.17337, 0.2)    (3.17337, 0.238693)       (3.17337, 7.9)
 (3.19447, 0.2)    (3.19447, 0.238693)       (3.19447, 7.9)
 (3.21558, 0.2)    (3.21558, 0.238693)    …  (3.21558, 7.9)
 (3.23668, 0.2)    (3.23668, 0.238693)       (3.23668, 7.9)
 (3.25779, 0.2)    (3.25779, 0.238693)       (3.25779, 7.9)
 (3.27889, 0.2)    (3.27889, 0.238693)       (3.27889, 7.9)
 (3.3, 0.2)        (3.3, 0.238693)           (3.3, 7.9)

Evaluate the grid with the machine before and after oversampling

# Predict the whole grid with one `predict` call per machine instead of one call
# per grid point (the grid holds 200 × 200 = 40,000 points).
# Columns follow the training order: petal width first, then petal length.
flat_points = vec(grid_points)
grid_table = Tables.table(hcat(first.(flat_points), last.(flat_points)))

# `vec` and `reshape` both use column-major order, so entry [i, j] of each result
# is still the prediction for grid_points[i, j].
grid_predictions = reshape(predict(mach, grid_table), size(grid_points))
grid_predictions_over = reshape(predict(mach_over, grid_table), size(grid_points))

200×200 CategoricalArrays.CategoricalArray{String,2,UInt32}:
 "setosa"  "setosa"  "setosa"  "setosa"  …  "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"  …  "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 ⋮                                       ⋱               
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"  …  "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"
 "setosa"  "setosa"  "setosa"  "setosa"     "virginica"  "virginica"

Make two contour plots using the grid predictions before and after oversampling

# Styling shared by both panels
contour_style = (levels = 3, color = :Set3_3, colorbar = false)

# x-axis is petal length, y-axis is petal width; the rows of the prediction
# matrices vary with petal width, the columns with petal length.
p = contourf(petal_length_range, petal_width_range, grid_predictions;
    contour_style...)
p_over = contourf(petal_length_range, petal_width_range, grid_predictions_over;
    contour_style...)
println()    # presumably keeps the cell from echoing the plot object

Scatter plot the data before and after oversampling

# The slicing below treats the first `old_count` rows of Xover as the original
# data and everything after them as the newly generated points.
old_count = size(X, 1)
new_rows = (old_count + 1):size(Xover, 1)

colors = Dict("setosa" => "green", "versicolor" => "yellow",
    "virginica" => "purple")
labels = unique(y)
for label in labels
    # Row masks are computed once per class and reused for both coordinates
    is_label = y .== label
    is_new_label = yover[new_rows] .== label

    # Original observations on the "before" panel
    scatter!(p, X.petal_length[is_label], X.petal_width[is_label],
        color = colors[label], label = label,
        title = "Before Oversampling")
    # The same original observations on the "after" panel
    scatter!(p_over, X.petal_length[is_label], X.petal_width[is_label],
        color = colors[label], label = label,
        title = "After Oversampling")
    # New points only, drawn with a different marker shape
    scatter!(p_over, Xover.petal_length[new_rows][is_new_label],
        Xover.petal_width[new_rows][is_new_label],
        color = colors[label], label = label*"-over", markershape = :hexagon,
        title = "After Oversampling")
end

plot_res = plot(p, p_over, layout = (1, 2), xlabel = "petal length",
    ylabel = "petal width", size = (900, 300), margin = 5mm, dpi = 200)
# Make sure the output directory exists before writing the figure into it
mkpath("./assets")
savefig(plot_res, "./assets/before-after-smote.png")
println()

Before After SMOTE

Notice how the minority class was completely ignored prior to oversampling. Not all models and hyperparameter settings are this sensitive to class imbalance.

Effect of Ratios Hyperparameter

Now let's study the effect of the ratios hyperparameter. We will do this through an animated plot.

# Everything that does not depend on the ratio is prepared once, outside the loop.
model = SVC(gamma = 0.01)
old_count = size(X, 1)      # rows of Xover past this index are the new points
# All grid points as one two-column table (petal width, petal length), so each
# frame needs a single `predict` call instead of one call per grid point.
grid_table = Tables.table(hcat(first.(vec(grid_points)), last.(vec(grid_points))))

# One frame per ratio value, from 0.3 to 2 in steps of 0.01
anim = @animate for versicolor_ratio ∈ 0.3:0.01:2
    # oversample
    Xover, yover =
        smote(X, y; k = 5, ratios = Dict("versicolor" => versicolor_ratio), rng = 42)

    # fit machine
    mach_over = machine(model, Xover, yover)
    fit!(mach_over, verbosity = 0)

    # grid predictions: `vec`/`reshape` are both column-major, so entry [i, j]
    # is the prediction for grid_points[i, j]
    grid_predictions_over =
        reshape(predict(mach_over, grid_table), size(grid_points))

    # plot
    p_over = contourf(petal_length_range, petal_width_range, grid_predictions_over,
        levels = 3, color = :Set3_3, colorbar = false)
    frame_title = "Oversampling versicolor with ratio $versicolor_ratio"
    new_rows = (old_count + 1):size(Xover, 1)
    for label in labels
        is_label = y .== label
        scatter!(p_over, X.petal_length[is_label], X.petal_width[is_label],
            color = colors[label], label = label,
            title = frame_title)
        # find new points only and plot with different shape
        is_new_label = yover[new_rows] .== label
        scatter!(p_over,
            Xover.petal_length[new_rows][is_new_label],
            Xover.petal_width[new_rows][is_new_label],
            color = colors[label], label = label * "-over", markershape = :hexagon,
            title = frame_title)
    end
    plot!(dpi = 150)
end

# Make sure the output directory exists before writing the animation into it
mkpath("./assets")
gif(anim, "./assets/smote-animation.gif", fps=6)
println()

Ratios Parameter Effect

Notice how setting ratios greedily can lead to overfitting.