本章包括下面各节:
12.1 多分类模型的评估方法
12.1.1 综合指标
12.1.2 关于每个标签值的二分类指标
12.1.3 Micro、Macro、Weighted计算的指标
12.2 数据探索
12.3 使用朴素贝叶斯算法进行多分类
12.4 二分类器组合
12.5 Softmax算法
12.6 多层感知器分类器
详细内容请阅读纸质书《Alink权威指南:基于Flink的机器学习实例入门(Python)》,这里为本章对应的示例代码。
# Environment setup and shared constants for the iris multi-class examples.
# Statement order is kept exactly as in the original cell.
from pyalink.alink import *

# Start a local PyAlink execution environment; the argument is the
# parallelism according to the PyAlink documentation.
useLocalEnv(1)

from utils import *
import os
import pandas as pd

# Widen pandas column display so long cell values are not truncated.
pd.set_option('display.max_colwidth', 1000)

# ROOT_DIR is not defined in this file; presumably it is supplied by the
# `utils` star import -- TODO confirm.
DATA_DIR = ROOT_DIR + "iris" + os.sep

# File names, all resolved relative to DATA_DIR by the cells that use them.
ORIGIN_FILE = "iris.data"
TRAIN_FILE = "train.ak"
TEST_FILE = "test.ak"

# Column schema passed to the CSV source: four numeric features + label.
SCHEMA_STRING = (
    "sepal_length double, sepal_width double, "
    "petal_length double, petal_width double, "
    "category string"
)

FEATURE_COL_NAMES = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

# Column names shared by every trainer / predictor / evaluator.
LABEL_COL_NAME = "category"
PREDICTION_COL_NAME = "pred"
PRED_DETAIL_COL_NAME = "pred_info"
#c_1
# Data exploration: load the raw CSV, queue a few summary printouts, then
# write the train/test split files.
source = (
    CsvSourceBatchOp()
    .setFilePath(DATA_DIR + ORIGIN_FILE)
    .setSchemaStr(SCHEMA_STRING)
)

# Queue: first 5 rows, summary statistics, and the correlation of the
# feature columns. The lazy* calls only register output; it is produced by
# the BatchOperator.execute() call below (per the PyAlink documentation).
(
    source
    .lazyPrint(5, "origin file")
    .lazyPrintStatistics("stat of origin file")
    .link(
        CorrelationBatchOp()
        .setSelectedCols(FEATURE_COL_NAMES)
        .lazyPrintCorrelation()
    )
)

# Row count per label value; -1 is passed as the row limit
# (presumably "print all rows" -- confirm against the lazyPrint docs).
source.groupBy(LABEL_COL_NAME, LABEL_COL_NAME + ", COUNT(*) AS cnt").lazyPrint(-1)

BatchOperator.execute()

# Helper not defined in this file (presumably from `utils`); 0.9 is
# presumably the training-set ratio -- TODO confirm.
splitTrainTestIfNotExist(source, DATA_DIR + TRAIN_FILE, DATA_DIR + TEST_FILE, 0.9)
#c_2
# Naive Bayes: train on the training split, predict on the test split and
# print multi-class evaluation metrics.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

trainer = (
    NaiveBayesTrainBatchOp()
    .setFeatureCols(FEATURE_COL_NAMES)
    .setLabelCol(LABEL_COL_NAME)
)

predictor = (
    NaiveBayesPredictBatchOp()
    .setPredictionCol(PREDICTION_COL_NAME)
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
)

# Wire the pipeline: training data feeds the trainer; the predictor takes
# the trained model first and the data to score second.
train_data.link(trainer)
predictor.linkFrom(trainer, test_data)

trainer.lazyPrintModelInfo()

predictor.lazyPrint(1, "< Prediction >")

# Evaluate using both the predicted label and the prediction detail column.
predictor.link(
    EvalMultiClassBatchOp()
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
    .lazyPrintMetrics("NaiveBayes")
)

# Runs the job and emits everything queued by the lazy* calls above.
BatchOperator.execute()
#c_3
# One-vs-rest: wrap three different base classifiers in OneVsRest and print
# multi-class metrics for each on the test split.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# setNumClass(3): presumably the number of distinct label values in the
# data set -- confirm against the data.

# --- Logistic regression as the base classifier ---
(
    OneVsRest()
    .setClassifier(
        LogisticRegression()
        .setFeatureCols(FEATURE_COL_NAMES)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
    )
    .setNumClass(3)
    .fit(train_data)
    .transform(test_data)
    .link(
        EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("OneVsRest_LogisticRegression")
    )
)

# --- GBDT as the base classifier ---
(
    OneVsRest()
    .setClassifier(
        GbdtClassifier()
        .setFeatureCols(FEATURE_COL_NAMES)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
    )
    .setNumClass(3)
    .fit(train_data)
    .transform(test_data)
    .link(
        EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("OneVsRest_GBDT")
    )
)

# --- Linear SVM as the base classifier ---
(
    OneVsRest()
    .setClassifier(
        LinearSvm()
        .setFeatureCols(FEATURE_COL_NAMES)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
    )
    .setNumClass(3)
    .fit(train_data)
    .transform(test_data)
    .link(
        EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("OneVsRest_LinearSvm")
    )
)

# Runs the job and emits the three metric reports queued above.
BatchOperator.execute()
#c_4
# Softmax: fit on the training split, score the test split, and print the
# training info, model info and multi-class metrics.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

(
    Softmax()
    .setFeatureCols(FEATURE_COL_NAMES)
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    # Both flags are enabled before fit() is called on the estimator.
    .enableLazyPrintTrainInfo()
    .enableLazyPrintModelInfo()
    .fit(train_data)
    .transform(test_data)
    .link(
        EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("Softmax")
    )
)

# Runs the job and emits everything queued by the lazy print calls above.
BatchOperator.execute()
#c_5
# Multilayer perceptron: compare two layer layouts on the same train/test
# split and print multi-class metrics for each.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# Layers [4, 12, 3]: presumably 4 inputs (one per feature column), one
# hidden layer of 12 units and 3 outputs -- confirm against the
# MultilayerPerceptronClassifier docs.
(
    MultilayerPerceptronClassifier()
    .setLayers([4, 12, 3])
    .setFeatureCols(FEATURE_COL_NAMES)
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    .fit(train_data)
    .transform(test_data)
    .link(
        EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("MultilayerPerceptronClassifier [4, 12, 3]")
    )
)

# Layers [4, 3]: the same first and last sizes with no middle entry.
(
    MultilayerPerceptronClassifier()
    .setLayers([4, 3])
    .setFeatureCols(FEATURE_COL_NAMES)
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    .fit(train_data)
    .transform(test_data)
    .link(
        EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("MultilayerPerceptronClassifier [4, 3]")
    )
)

# Runs the job and emits both metric reports queued above.
BatchOperator.execute()