Creating an end-to-end workflow

Let's create an end-to-end machine learning workflow.

Importing the required packages

import numpy as np

from pjautoml.cs.operator.free.chain import Chain
from pjautoml.cs.operator.free.map import Map
from pjautoml.cs.operator.free.select import Select
from pjautoml.cs.operator.free.shuffle import Shuffle
from pjautoml.cs.workflow import Workflow
from pjml.data.communication.report import Report
from pjml.data.evaluation.metric import Metric
from pjml.data.flow.file import File
from pjml.stream.expand.partition import Partition
from pjml.stream.reduce.reduce import Reduce
from pjml.stream.reduce.summ import Summ
from pjpy.modeling.supervised.classifier.dt import DT
from pjpy.modeling.supervised.classifier.svmc import SVMC
from pjpy.processing.feature.reductor.pca import PCA
from pjpy.processing.feature.scaler.minmax import MinMax

np.random.seed(0)

First, we create a machine learning expression.
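
A minimal sketch of such an expression, assuming the free operators accept the imported components directly (the exact composition written here is illustrative, not necessarily the one that produced the samples below):

exp = Workflow(
    Chain(MinMax, PCA),   # preprocessing: min-max scaling followed by PCA
    Select(DT, SVMC),     # choose one of the two classifiers
)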

It represents a configuration space. Let’s get a sample from it.

print(exp.sample())

Out:

{
    "info": {
        "_id": "MinMax@pjpy.processing.feature.scaler.minmax",
        "config": {
            "feature_range": [
                0,
                1
            ]
        }
    },
    "enhance": true,
    "model": true
}
{
    "info": {
        "_id": "PCA@pjpy.processing.feature.reductor.pca",
        "config": {
            "n": 0.7151893663724195
        }
    },
    "enhance": true,
    "model": true
}
{
    "info": {
        "_id": "DT@pjpy.modeling.supervised.classifier.dt",
        "config": {
            "criterion": "entropy",
            "splitter": "best",
            "class_weight": "balanced",
            "max_features": null,
            "max_depth": 425,
            "min_samples_split": 0.1937685880258838,
            "min_samples_leaf": 0.1312767257915965,
            "min_weight_fraction_leaf": 0.2675319002346239,
            "min_impurity_decrease": 0.19273255210020587,
            "seed": 0
        }
    },
    "enhance": true,
    "model": true
}

Having defined our machine learning expression, we can now create an end-to-end workflow.

workflow = Workflow(
    File("../data/iris.arff"),
    Partition(),
    Map(exp, Metric()),
    Summ(function="mean"),
    Reduce(),
    Report("Mean S: $S"),
)

or, equivalently, using only Python operators:

workflow = (
    File("../data/iris.arff")
    * Partition()
    * Map(exp * Metric())
    * Summ(function="mean")
    * Reduce()
    * Report("Mean S: $S")
)

This workflow represents the union of all configuration spaces. Let's get a sample from it:

spl = workflow.sample()
print(spl)

Out:

{
    "info": {
        "_id": "File@pjml.data.flow.file",
        "config": {
            "name": "../data/iris.arff",
            "path": "./",
            "description": "No description.",
            "hashes": {
                "X": "0ǏǍɽĊũÊүȏŵҖSîҕ",
                "Y": "0ЄϒɐĵǏȂϗƽўýÎʃȆ",
                "Xd": "5ɫңɖŇǓήʼnÝʑΏƀЀǔ",
                "Yd": "5mϛǖͶƅĞOȁЎžʛѲƨ",
                "Xt": "5ȥΔĨӑËҭȨƬδſΧȰɩ",
                "Yt": "5έēPaӹЄźգǩȱɟǟǹ"
            }
        }
    },
    "enhance": true,
    "model": true
}
{
    "info": {
        "_id": "Partition@pjml.stream.expand.partition",
        "config": {
            "split_type": "cv",
            "partitions": 10,
            "seed": 0,
            "fields": "X,Y"
        }
    },
    "enhance": true,
    "model": true
}
Map>>
    {"info": {"_id": "Pipeline@pjml.operator.pipeline","config": {"components": [{"info": {"_id": "MinMax@pjpy.processing.feature.scaler.minmax","config": {"feature_range": [0,1],"model": true,"enhance": true}},"enhance": true,"model": true},{"info": {"_id": "PCA@pjpy.processing.feature.reductor.pca","config": {"n": 0.7917250380826646,"model": true,"enhance": true}},"enhance": true,"model": true}],"model": true,"enhance": true}},"enhance": true,"model": true}
    {"info": {"_id": "DT@pjpy.modeling.supervised.classifier.dt","config": {"criterion": "gini","splitter": "best","class_weight": null,"max_features": "sqrt","max_depth": 926,"min_samples_split": 0.021311746423307888,"min_samples_leaf": 0.026139702781162514,"min_weight_fraction_leaf": 0.006065519232097715,"min_impurity_decrease": 0.1665239691095876,"seed": 0,"model": true,"enhance": true}},"enhance": true,"model": true}
    {"info": {"_id": "Metric@pjml.data.evaluation.metric","config": {"functions": ["accuracy"],"target": "Y","prediction": "Z","model": true,"enhance": true}},"enhance": true,"model": true}
<<Map
{
    "info": {
        "_id": "Summ@pjml.stream.reduce.summ",
        "config": {
            "field": "R",
            "function": "mean"
        }
    },
    "enhance": true,
    "model": true
}
{
    "info": {
        "_id": "Reduce@pjml.stream.reduce.reduce",
        "config": {}
    },
    "enhance": true,
    "model": true
}
{
    "info": {
        "_id": "Report@pjml.data.communication.report",
        "config": {
            "text": "Mean S: $S"
        }
    },
    "enhance": true,
    "model": true
}
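
Because the workflow is itself a configuration space rather than a fixed pipeline, every call to sample() draws a new concrete pipeline, as the differing PCA and DT hyperparameters in the samples above already show. For example (the sampled values vary between draws):

# Each call to sample() draws a fresh concrete pipeline from the space.
for _ in range(2):
    print(workflow.sample())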

Total running time of the script: ( 0 minutes 0.041 seconds)
