Alink教程(Python版)

第12章 从二分类到多分类

本章包括下面各节:
12.1 多分类模型的评估方法
12.1.1 综合指标
12.1.2 关于每个标签值的二分类指标
12.1.3 Micro、Macro、Weighted计算的指标
12.2 数据探索
12.3 使用朴素贝叶斯算法进行多分类
12.4 二分类器组合
12.5 Softmax算法
12.6 多层感知器分类器

详细内容请阅读纸质书《Alink权威指南:基于Flink的机器学习实例入门(Python)》,这里为本章对应的示例代码。

from pyalink.alink import *
useLocalEnv(1)

from utils import *
import os
import pandas as pd

# Widen pandas' column display limit so long cell values are not truncated.
pd.set_option('display.max_colwidth', 1000)

# Directory holding the Iris files; ROOT_DIR comes from the `utils` star import.
DATA_DIR = ROOT_DIR + "iris" + os.sep

# File names, all relative to DATA_DIR.
ORIGIN_FILE = "iris.data"
TRAIN_FILE = "train.ak"
TEST_FILE = "test.ak"

# Column schema handed to the CSV source: four double features and a string label.
SCHEMA_STRING = (
    "sepal_length double, sepal_width double, "
    "petal_length double, petal_width double, "
    "category string"
)

# Feature columns used by every classifier in this script.
FEATURE_COL_NAMES = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

# Label column, plus the output columns that predictions are written to.
LABEL_COL_NAME = "category"
PREDICTION_COL_NAME = "pred"
PRED_DETAIL_COL_NAME = "pred_info"

#c_1
# Data exploration: read the raw CSV with the explicit schema.
source = (
    CsvSourceBatchOp()
    .setFilePath(DATA_DIR + ORIGIN_FILE)
    .setSchemaStr(SCHEMA_STRING)
)

# Queue a sample printout and summary statistics on the source, then feed the
# same data into a correlation op restricted to the feature columns.
(
    source
    .lazyPrint(5, "origin file")
    .lazyPrintStatistics("stat of origin file")
    .link(
        CorrelationBatchOp()
        .setSelectedCols(FEATURE_COL_NAMES)
        .lazyPrintCorrelation()
    )
)

# Row count per label value.
# NOTE(review): -1 presumably means "print every row" -- confirm in Alink docs.
source.groupBy(LABEL_COL_NAME, LABEL_COL_NAME + ", COUNT(*) AS cnt").lazyPrint(-1)

# Run the job so the lazy outputs queued above are produced.
BatchOperator.execute()

# Helper from `utils`. Judging by its name and arguments it writes the
# train/test files with a 0.9 ratio only when they are missing -- confirm
# against the helper's definition.
splitTrainTestIfNotExist(source, DATA_DIR + TRAIN_FILE, DATA_DIR + TEST_FILE, 0.9)

#c_2
# Naive Bayes multi-class classification, built from batch operators.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# Training operator: learns from the feature columns against the label column.
trainer = (
    NaiveBayesTrainBatchOp()
    .setFeatureCols(FEATURE_COL_NAMES)
    .setLabelCol(LABEL_COL_NAME)
)

# Prediction operator: writes the prediction and its detail to the two
# configured output columns.
predictor = (
    NaiveBayesPredictBatchOp()
    .setPredictionCol(PREDICTION_COL_NAME)
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
)

# Wire the graph: training data -> trainer, then (model, test data) -> predictor.
train_data.link(trainer)
predictor.linkFrom(trainer, test_data)

# Queue the model summary and one sample prediction row for printing.
trainer.lazyPrintModelInfo()
predictor.lazyPrint(1, "< Prediction >")

# Evaluate the predictions as a multi-class problem and queue the metrics.
predictor.link(
    EvalMultiClassBatchOp()
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
    .lazyPrintMetrics("NaiveBayes")
)

# Run the job so the lazy outputs queued above are produced.
BatchOperator.execute()
#c_3
# One-vs-rest: wrap several binary classifiers so they can handle the three
# Iris classes, and evaluate each wrapper on the same train/test split.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# The three experiments differ only in the wrapped classifier class and the
# title of the metrics report, so they share one pipeline definition. The
# classifier is instantiated inside the chain, after OneVsRest(), which keeps
# the original construction order.
for _metrics_title, _binary_classifier in (
    ("OneVsRest_LogisticRegression", LogisticRegression),
    ("OneVsRest_GBDT", GbdtClassifier),
    ("OneVsRest_LinearSvm", LinearSvm),
):
    (
        OneVsRest()
        .setClassifier(
            _binary_classifier()
            .setFeatureCols(FEATURE_COL_NAMES)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
        )
        .setNumClass(3)
        .fit(train_data)
        .transform(test_data)
        .link(
            EvalMultiClassBatchOp()
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .lazyPrintMetrics(_metrics_title)
        )
    )

# Run the job so the three queued metric reports are produced.
BatchOperator.execute()
#c_4
# Softmax classification through the pipeline (fit/transform) API.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# Configure the estimator, request lazy printing of training and model info,
# fit on the training split, predict on the test split, then send the
# predictions to the multi-class evaluator.
(
    Softmax()
    .setFeatureCols(FEATURE_COL_NAMES)
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    .enableLazyPrintTrainInfo()
    .enableLazyPrintModelInfo()
    .fit(train_data)
    .transform(test_data)
    .link(
        EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("Softmax")
    )
)

# Run the job so the lazy outputs queued above are produced.
BatchOperator.execute()

#c_5
# Multilayer perceptron classification with two layer configurations.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# Both runs share one pipeline; only the layer sizes and the title of the
# metrics report change. Each layer list starts at 4 (matching the four
# feature columns) and ends at 3 (matching setNumClass(3) used for this data
# set elsewhere in the script).
for _layers, _metrics_title in (
    ([4, 12, 3], "MultilayerPerceptronClassifier [4, 12, 3]"),
    ([4, 3], "MultilayerPerceptronClassifier [4, 3]"),
):
    (
        MultilayerPerceptronClassifier()
        .setLayers(_layers)
        .setFeatureCols(FEATURE_COL_NAMES)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .fit(train_data)
        .transform(test_data)
        .link(
            EvalMultiClassBatchOp()
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .lazyPrintMetrics(_metrics_title)
        )
    )

# Run the job so both queued metric reports are produced.
BatchOperator.execute()