本章包括下面各节:
12.1 多分类模型的评估方法
12.1.1 综合指标
12.1.2 关于每个标签值的二分类指标
12.1.3 Micro、Macro、Weighted计算的指标
12.2 数据探索
12.3 使用朴素贝叶斯算法进行多分类
12.4 二分类器组合
12.5 Softmax算法
12.6 多层感知器分类器
详细内容请阅读纸质书《Alink权威指南:基于Flink的机器学习实例入门(Python)》,这里为本章对应的示例代码。
# PyAlink operators (sources, trainers, predictors, evaluators) are pulled in
# through a star import and used unqualified throughout this script.
from pyalink.alink import *
# Start a local PyAlink execution environment before any operator is built.
# NOTE(review): the argument 1 is presumably the parallelism - confirm against
# the PyAlink `useLocalEnv` documentation.
useLocalEnv(1)
# Project-local helpers; ROOT_DIR and splitTrainTestIfNotExist used below are
# presumably defined there (module not visible from this file).
from utils import *
import os
import pandas as pd
# Widen pandas' column display limit to 1000 characters so long cell values
# are not truncated when results are printed.
pd.set_option('display.max_colwidth', 1000)
# Directory holding the Iris files, with a trailing path separator so file
# names can be appended by plain string concatenation.
DATA_DIR = ROOT_DIR + "iris" + os.sep

# File names inside DATA_DIR: the raw CSV plus the train/test splits.
ORIGIN_FILE = "iris.data"
TRAIN_FILE = "train.ak"
TEST_FILE = "test.ak"

# Column schema handed to the CSV source: four numeric features and a label.
SCHEMA_STRING = (
    "sepal_length double, sepal_width double, "
    "petal_length double, petal_width double, category string"
)

# Feature columns fed to every classifier in this script.
FEATURE_COL_NAMES = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

# Label column and the output columns written by the predictors.
LABEL_COL_NAME = "category"
PREDICTION_COL_NAME = "pred"
PRED_DETAIL_COL_NAME = "pred_info"
#c_1
# Data exploration: load the raw CSV, queue a sample print, summary
# statistics, a feature correlation report and a per-label row count, then
# run everything with a single execute() call.
source = (
    CsvSourceBatchOp()
    .setFilePath(DATA_DIR + ORIGIN_FILE)
    .setSchemaStr(SCHEMA_STRING)
)

# Lazy prints are only queued here; output appears when execute() runs.
inspected_source = (
    source
    .lazyPrint(5, "origin file")
    .lazyPrintStatistics("stat of origin file")
)
correlation_op = (
    CorrelationBatchOp()
    .setSelectedCols(FEATURE_COL_NAMES)
    .lazyPrintCorrelation()
)
inspected_source.link(correlation_op)

# Row count per label value.
label_counts = source.groupBy(
    LABEL_COL_NAME,
    LABEL_COL_NAME + ", COUNT(*) AS cnt",
)
label_counts.lazyPrint(-1)

BatchOperator.execute()

# Helper that is not defined in this file; judging by its name it writes the
# train/test files only when they are missing - confirm against its definition.
splitTrainTestIfNotExist(
    source,
    DATA_DIR + TRAIN_FILE,
    DATA_DIR + TEST_FILE,
    0.9,
)
#c_2
# Naive Bayes multi-class classification: train on the training split,
# predict on the test split, and queue the model info, one sample prediction
# and the multi-class evaluation metrics.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

trainer = (
    NaiveBayesTrainBatchOp()
    .setFeatureCols(FEATURE_COL_NAMES)
    .setLabelCol(LABEL_COL_NAME)
)
predictor = (
    NaiveBayesPredictBatchOp()
    .setPredictionCol(PREDICTION_COL_NAME)
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
)

# Wire the graph: training data -> trainer, then (model, test data) -> predictor.
train_data.link(trainer)
predictor.linkFrom(trainer, test_data)

trainer.lazyPrintModelInfo()
predictor.lazyPrint(1, "< Prediction >")

# The evaluator is given the prediction detail column as well as the label
# and prediction columns.
evaluator = (
    EvalMultiClassBatchOp()
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
    .lazyPrintMetrics("NaiveBayes")
)
predictor.link(evaluator)

BatchOperator.execute()
#c_3
# Combining binary classifiers: wrap each binary classifier in OneVsRest to
# handle the three Iris classes, then evaluate each on the test split.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# (binary classifier class, title printed with its metrics), in run order.
binary_classifiers = (
    (LogisticRegression, "OneVsRest_LogisticRegression"),
    (GbdtClassifier, "OneVsRest_GBDT"),
    (LinearSvm, "OneVsRest_LinearSvm"),
)

for classifier_cls, metrics_title in binary_classifiers:
    one_vs_rest = OneVsRest()
    base_classifier = (
        classifier_cls()
        .setFeatureCols(FEATURE_COL_NAMES)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
    )
    predictions = (
        one_vs_rest
        .setClassifier(base_classifier)
        .setNumClass(3)
        .fit(train_data)
        .transform(test_data)
    )
    evaluator = (
        EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics(metrics_title)
    )
    predictions.link(evaluator)

# One execute() call runs all three queued pipelines.
BatchOperator.execute()
#c_4
# Softmax classification: fit on the training split with lazy printing of
# the training info and model info enabled, then evaluate predictions on the
# test split.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

softmax = (
    Softmax()
    .setFeatureCols(FEATURE_COL_NAMES)
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    .enableLazyPrintTrainInfo()
    .enableLazyPrintModelInfo()
)
predictions = softmax.fit(train_data).transform(test_data)

evaluator = (
    EvalMultiClassBatchOp()
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    .lazyPrintMetrics("Softmax")
)
predictions.link(evaluator)

BatchOperator.execute()
#c_5
# Multilayer perceptron classification: train and evaluate two layer
# configurations, [4, 12, 3] and [4, 3], on the same train/test splits.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# (layer sizes, title printed with the metrics), in run order.
layer_configs = (
    ([4, 12, 3], "MultilayerPerceptronClassifier [4, 12, 3]"),
    ([4, 3], "MultilayerPerceptronClassifier [4, 3]"),
)

for layer_sizes, metrics_title in layer_configs:
    perceptron = (
        MultilayerPerceptronClassifier()
        .setLayers(layer_sizes)
        .setFeatureCols(FEATURE_COL_NAMES)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
    )
    predictions = perceptron.fit(train_data).transform(test_data)
    evaluator = (
        EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics(metrics_title)
    )
    predictions.link(evaluator)

# One execute() call runs both queued pipelines.
BatchOperator.execute()
```python
```