Note
Click here to download the full example code
Creating an end-to-end AutoML from scratch
Let’s create an AutoML from scratch.
Importing the required packages
import numpy as np
from pjautoml.cs.operator.datadriven.optimization.modelfree.random import RandomSearch
from pjautoml.cs.operator.free.map import Map
from pjautoml.cs.operator.free.select import Select
from pjautoml.cs.operator.free.shuffle import Shuffle
from pjautoml.cs.workflow import Workflow
from pjml.data.communication.report import Report
from pjml.data.evaluation.metric import Metric
from pjml.data.flow.file import File
from pjml.stream.expand.partition import Partition
from pjml.stream.reduce.reduce import Reduce
from pjml.stream.reduce.summ import Summ
from pjpy.modeling.supervised.classifier.dt import DT
from pjpy.modeling.supervised.classifier.svmc import SVMC
from pjpy.processing.feature.reductor.pca import PCA
from pjpy.processing.feature.scaler.minmax import MinMax
np.random.seed(0)
First, we can define a workflow. Notice that we do not add a File to it;
the data is loaded separately further below. Then, we use
random search as the optimization process to select the best pipeline.
Finally, we should also give it a name. The name of my AutoML, of course, is
my_automl
:)
Now, let’s find a good pipeline for the iris dataset:
data = File("../data/iris.arff").data
best_pipeline = my_automl(data)
print(best_pipeline)
Out:
[model] Mean S: array([[0.96666667]])
[model] Mean S: array([[0.96666667]])
[model] Mean S: array([[0.33333333]])
[model] Mean S: array([[0.96666667]])
[model] Mean S: array([[0.33333333]])
[model] Mean S: array([[0.94]])
[model] Mean S: array([[0.93333333]])
[model] Mean S: array([[0.94666667]])
[model] Mean S: array([[0.97333333]])
[model] Mean S: array([[0.38]])
[model] Mean S: array([[0.00666667]])
[model] Mean S: array([[0.96666667]])
[model] Mean S: array([[0.33333333]])
[model] Mean S: array([[0.95333333]])
[model] Mean S: array([[0.94]])
[model] Mean S: array([[0.33333333]])
[model] Mean S: array([[0.94666667]])
[model] Mean S: array([[0.96]])
[model] Mean S: array([[0.33333333]])
[model] Mean S: array([[0.94666667]])
[model] Mean S: array([[0.96666667]])
[model] Mean S: array([[0.93333333]])
[model] Mean S: array([[0.96666667]])
[model] Mean S: array([[0.96666667]])
[model] Mean S: array([[0.94666667]])
[model] Mean S: array([[0.96]])
[model] Mean S: array([[0.95333333]])
[model] Mean S: array([[0.33333333]])
[model] Mean S: array([[0.96]])
[model] Mean S: array([[0.94]])
{
"info": {
"_id": "Partition@pjml.stream.expand.partition",
"config": {
"split_type": "cv",
"partitions": 10,
"seed": 0,
"fields": "X,Y"
}
},
"enhance": true,
"model": true
}
Map>>
{"info": {"_id": "Pipeline@pjml.operator.pipeline","config": {"components": [{"info": {"_id": "MinMax@pjpy.processing.feature.scaler.minmax","config": {"feature_range": [-1,1],"model": true,"enhance": true}},"enhance": true,"model": true},{"info": {"_id": "PCA@pjpy.processing.feature.reductor.pca","config": {"n": 0.9764594650133958,"model": true,"enhance": true}},"enhance": true,"model": true}],"model": true,"enhance": true}},"enhance": true,"model": true}
{"info": {"_id": "SVMC@pjpy.modeling.supervised.classifier.svmc","config": {"C": 97.6761111429249,"kernel": "linear","degree": 3,"gamma": "scale","coef0": 0.0,"shrinking": false,"probability": false,"tol": 0.1,"cache_size": 200,"class_weight": "balanced","verbose": false,"max_iter": 1000000,"decision_function_shape": "ovo","break_ties": false,"random_state": null,"seed": 0,"model": true,"enhance": true}},"enhance": true,"model": true}
{"info": {"_id": "Metric@pjml.data.evaluation.metric","config": {"functions": ["accuracy"],"target": "Y","prediction": "Z","model": true,"enhance": true}},"enhance": true,"model": true}
<<Map
{
"info": {
"_id": "Summ@pjml.stream.reduce.summ",
"config": {
"field": "R",
"function": "mean"
}
},
"enhance": true,
"model": true
}
{
"info": {
"_id": "Reduce@pjml.stream.reduce.reduce",
"config": {}
},
"enhance": true,
"model": true
}
{
"info": {
"_id": "Report@pjml.data.communication.report",
"config": {
"text": "Mean S: $S"
}
},
"enhance": true,
"model": true
}
Total running time of the script: ( 0 minutes 3.250 seconds)