Creating an end-to-end AutoML from scratch

Let’s create an AutoML from scratch.

Importing the required packages

import numpy as np

from pjautoml.cs.operator.datadriven.optimization.modelfree.random import RandomSearch
from pjautoml.cs.operator.free.map import Map
from pjautoml.cs.operator.free.select import Select
from pjautoml.cs.operator.free.shuffle import Shuffle
from pjautoml.cs.workflow import Workflow
from pjml.data.communication.report import Report
from pjml.data.evaluation.metric import Metric
from pjml.data.flow.file import File
from pjml.stream.expand.partition import Partition
from pjml.stream.reduce.reduce import Reduce
from pjml.stream.reduce.summ import Summ
from pjpy.modeling.supervised.classifier.dt import DT
from pjpy.modeling.supervised.classifier.svmc import SVMC
from pjpy.processing.feature.reductor.pca import PCA
from pjpy.processing.feature.scaler.minmax import MinMax

# Seed NumPy's global random generator with a fixed value (0).
# NOTE(review): presumably this makes the random search below repeatable;
# confirm that the search draws its randomness from NumPy's global state.
np.random.seed(0)

First, we define a workflow. Notice that we do not add a File component to it; the data is passed in as an argument instead. Then, we use random search as the optimization process to select the best pipeline. Finally, we should also give our AutoML a name. The name of my AutoML, of course, is my_automl :)

def my_automl(data):
    """Run a random search over a fixed workflow and return its first component.

    The workflow consists of, in order: a ``Partition`` step, a ``Map`` over
    three inner steps (``Shuffle`` of ``PCA`` and ``MinMax``, ``Select`` of
    ``SVMC + DT``, and a ``Metric``), a ``Summ`` with ``function="mean"``,
    a ``Reduce`` step and a ``Report`` step.

    ``RandomSearch`` is called with ``sample=30`` and receives ``data`` as
    both its ``train`` and ``test`` arguments.

    Args:
        data: The data object handed to the search for training and testing.

    Returns:
        ``components[0]`` of the ``RandomSearch`` instance.
        NOTE(review): presumably this is the best pipeline found; confirm
        against the RandomSearch implementation.
    """
    # Components are created in the same order as they appear in the
    # workflow, so any side effects of construction keep their sequence.
    partition_step = Partition()

    preprocessing = Shuffle(PCA, MinMax)
    classifier = Select(SVMC + DT)
    metric = Metric()
    mapped_steps = Map(preprocessing, classifier, metric)

    summary_step = Summ(function="mean")
    reduce_step = Reduce()
    report_step = Report("Mean S: $S")

    search_space = Workflow(
        partition_step,
        mapped_steps,
        summary_step,
        reduce_step,
        report_step,
    )

    search = RandomSearch(search_space, sample=30, train=data, test=data)
    return search.components[0]

Now, let’s find a good pipeline for the iris dataset:

# Open the iris ARFF file and take its ``data`` attribute.
iris_file = File("../data/iris.arff")
data = iris_file.data

# Run the AutoML function defined above and print what it returns.
best_pipeline = my_automl(data)
print(best_pipeline)

Out:

   [model]  Mean S: array([[0.96666667]])
   [model]  Mean S: array([[0.96666667]])
   [model]  Mean S: array([[0.33333333]])
   [model]  Mean S: array([[0.96666667]])
   [model]  Mean S: array([[0.33333333]])
   [model]  Mean S: array([[0.94]])
   [model]  Mean S: array([[0.93333333]])
   [model]  Mean S: array([[0.94666667]])
   [model]  Mean S: array([[0.97333333]])
   [model]  Mean S: array([[0.38]])
   [model]  Mean S: array([[0.00666667]])
   [model]  Mean S: array([[0.96666667]])
   [model]  Mean S: array([[0.33333333]])
   [model]  Mean S: array([[0.95333333]])
   [model]  Mean S: array([[0.94]])
   [model]  Mean S: array([[0.33333333]])
   [model]  Mean S: array([[0.94666667]])
   [model]  Mean S: array([[0.96]])
   [model]  Mean S: array([[0.33333333]])
   [model]  Mean S: array([[0.94666667]])
   [model]  Mean S: array([[0.96666667]])
   [model]  Mean S: array([[0.93333333]])
   [model]  Mean S: array([[0.96666667]])
   [model]  Mean S: array([[0.96666667]])
   [model]  Mean S: array([[0.94666667]])
   [model]  Mean S: array([[0.96]])
   [model]  Mean S: array([[0.95333333]])
   [model]  Mean S: array([[0.33333333]])
   [model]  Mean S: array([[0.96]])
   [model]  Mean S: array([[0.94]])
{
    "info": {
        "_id": "Partition@pjml.stream.expand.partition",
        "config": {
            "split_type": "cv",
            "partitions": 10,
            "seed": 0,
            "fields": "X,Y"
        }
    },
    "enhance": true,
    "model": true
}
Map>>
    {"info": {"_id": "Pipeline@pjml.operator.pipeline","config": {"components": [{"info": {"_id": "MinMax@pjpy.processing.feature.scaler.minmax","config": {"feature_range": [-1,1],"model": true,"enhance": true}},"enhance": true,"model": true},{"info": {"_id": "PCA@pjpy.processing.feature.reductor.pca","config": {"n": 0.9764594650133958,"model": true,"enhance": true}},"enhance": true,"model": true}],"model": true,"enhance": true}},"enhance": true,"model": true}
    {"info": {"_id": "SVMC@pjpy.modeling.supervised.classifier.svmc","config": {"C": 97.6761111429249,"kernel": "linear","degree": 3,"gamma": "scale","coef0": 0.0,"shrinking": false,"probability": false,"tol": 0.1,"cache_size": 200,"class_weight": "balanced","verbose": false,"max_iter": 1000000,"decision_function_shape": "ovo","break_ties": false,"random_state": null,"seed": 0,"model": true,"enhance": true}},"enhance": true,"model": true}
    {"info": {"_id": "Metric@pjml.data.evaluation.metric","config": {"functions": ["accuracy"],"target": "Y","prediction": "Z","model": true,"enhance": true}},"enhance": true,"model": true}
<<Map
{
    "info": {
        "_id": "Summ@pjml.stream.reduce.summ",
        "config": {
            "field": "R",
            "function": "mean"
        }
    },
    "enhance": true,
    "model": true
}
{
    "info": {
        "_id": "Reduce@pjml.stream.reduce.reduce",
        "config": {}
    },
    "enhance": true,
    "model": true
}
{
    "info": {
        "_id": "Report@pjml.data.communication.report",
        "config": {
            "text": "Mean S: $S"
        }
    },
    "enhance": true,
    "model": true
}

Total running time of the script: ( 0 minutes 3.250 seconds)

Gallery generated by Sphinx-Gallery