This chapter covers the following sections:
9.1 The Naive Bayes Model
9.2 The Decision Tree Model
9.2.1 Definitions of Split Criteria for Decision Trees
9.2.2 Commonly Used Decision Tree Algorithms
9.2.3 Worked Examples of Criterion Calculation
9.2.4 Classification Trees and Regression Trees
9.2.5 A Classic Decision Tree Example
9.3 Data Exploration
9.4 Using the Naive Bayes Method
9.5 A Decision Tree for Mushroom Classification
For the full discussion, please refer to the printed book 《Alink权威指南:基于Flink的机器学习实例入门(Python)》; this page contains the sample code for the chapter.
from pyalink.alink import *
useLocalEnv(1)

from utils import *   # provides ROOT_DIR, generateSchemaString, splitTrainTestIfNotExist
import os
import pandas as pd

# The UCI mushroom data set (agaricus-lepiota) and its Alink-format train/test files.
DATA_DIR = ROOT_DIR + "mushroom" + os.sep
ORIGIN_FILE = "agaricus-lepiota.data"
TRAIN_FILE = "train.ak"
TEST_FILE = "test.ak"
COL_NAMES = [
    "class",
    "cap_shape", "cap_surface", "cap_color", "bruises", "odor",
    "gill_attachment", "gill_spacing", "gill_size", "gill_color",
    "stalk_shape", "stalk_root", "stalk_surface_above_ring", "stalk_surface_below_ring",
    "stalk_color_above_ring", "stalk_color_below_ring",
    "veil_type", "veil_color",
    "ring_number", "ring_type", "spore_print_color", "population", "habitat"
]
COL_TYPES = ["string"] * len(COL_NAMES)   # all 23 columns are categorical strings
LABEL_COL_NAME = "class"
FEATURE_COL_NAMES = COL_NAMES.copy()
FEATURE_COL_NAMES.remove(LABEL_COL_NAME)
PREDICTION_COL_NAME = "pred"
PRED_DETAIL_COL_NAME = "predInfo"
#c_1
# Read the original CSV data, preview a few rows, and split it into train/test sets.
source = CsvSourceBatchOp()\
    .setFilePath(DATA_DIR + ORIGIN_FILE)\
    .setSchemaStr(generateSchemaString(COL_NAMES, COL_TYPES))

source.lazyPrint(5, "< origin data >")

splitTrainTestIfNotExist(source, DATA_DIR + TRAIN_FILE, DATA_DIR + TEST_FILE, 0.9)

# Select the top 3 features by chi-square test against the label and print the
# selector's model info.
AkSourceBatchOp()\
    .setFilePath(DATA_DIR + TRAIN_FILE)\
    .link(
        ChiSqSelectorBatchOp()\
            .setSelectorType("NumTopFeatures")\
            .setNumTopFeatures(3)\
            .setSelectedCols(FEATURE_COL_NAMES)\
            .setLabelCol(LABEL_COL_NAME)\
            .lazyPrintModelInfo("< Chi-Square Selector >")
    )

# Check the distinct values of veil_type (in the UCI mushroom data it is constant,
# so it carries no information about the label).
AkSourceBatchOp()\
    .setFilePath(DATA_DIR + TRAIN_FILE)\
    .select("veil_type")\
    .distinct()\
    .lazyPrint(100)

BatchOperator.execute()
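# ChiSqSelectorBatchOp above ranks each candidate feature with a chi-square test
# against the label. The standalone helper below shows how that statistic can be
# computed by hand for a single feature using plain pandas; chi_square_statistic
# and the commented-out usage are illustrative local additions, not Alink APIs.
def chi_square_statistic(data: pd.DataFrame, feature: str, label: str = "class") -> float:
    observed = pd.crosstab(data[feature], data[label])   # contingency table
    row_totals = observed.sum(axis=1).values
    col_totals = observed.sum(axis=0).values
    total = observed.values.sum()
    # Expected counts under independence: row_total * col_total / total.
    expected = row_totals.reshape(-1, 1) * col_totals.reshape(1, -1) / total
    return float(((observed.values - expected) ** 2 / expected).sum())

# Example usage on the raw CSV (uncomment to run locally):
# raw = pd.read_csv(DATA_DIR + ORIGIN_FILE, names=COL_NAMES)
# top3 = sorted(((chi_square_statistic(raw, f), f) for f in FEATURE_COL_NAMES), reverse=True)[:3]
# print(top3)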
#c_2_1
# Train a naive Bayes model on all 22 categorical features and evaluate it on the test set.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

trainer = NaiveBayesTrainBatchOp()\
    .setFeatureCols(FEATURE_COL_NAMES)\
    .setCategoricalCols(FEATURE_COL_NAMES)\
    .setLabelCol(LABEL_COL_NAME)

predictor = NaiveBayesPredictBatchOp()\
    .setPredictionCol(PREDICTION_COL_NAME)\
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)

train_data.link(trainer)
predictor.linkFrom(trainer, test_data)

trainer.lazyPrintModelInfo()

# Print the conditional distributions of a few informative features.
def print_model_info(naiveBayesModelInfo: NaiveBayesModelInfo):
    for feature in ["odor", "spore_print_color", "gill_color"]:
        print("feature: " + feature)
        print(naiveBayesModelInfo.getCategoryFeatureInfo().get(feature))

trainer.lazyCollectModelInfo(print_model_info)

predictor.lazyPrint(10, "< Prediction >")

predictor\
    .link(
        EvalBinaryClassBatchOp()\
            .setPositiveLabelValueString("p")\
            .setLabelCol(LABEL_COL_NAME)\
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
            .lazyPrintMetrics()
    )

BatchOperator.execute()
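# A minimal standalone sketch of how a categorical naive Bayes classifier scores
# a sample: multiply the class prior by P(feature value | class) for every
# feature, then normalize. The prior and conditional tables below are made-up
# illustrative numbers, NOT the values learned by NaiveBayesTrainBatchOp.
priors = {"p": 0.48, "e": 0.52}   # hypothetical class priors ("p" poisonous, "e" edible)
cond_prob = {
    "odor":       {"p": {"f": 0.55, "n": 0.03}, "e": {"f": 0.01, "n": 0.80}},
    "gill_color": {"p": {"b": 0.44, "w": 0.06}, "e": {"b": 0.01, "w": 0.30}},
}

def naive_bayes_posterior(sample: dict) -> dict:
    scores = {}
    for label, prior in priors.items():
        score = prior
        for feature, value in sample.items():
            # P(feature = value | label); a real implementation would apply
            # smoothing to unseen values instead of a small fallback constant.
            score *= cond_prob[feature][label].get(value, 1e-6)
        scores[label] = score
    total = sum(scores.values())
    return {label: s / total for label, s in scores.items()}

# With these made-up tables, a foul odor ("f") pushes nearly all the
# probability mass to the poisonous class.
print(naive_bayes_posterior({"odor": "f", "gill_color": "b"}))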
#c_2_2
# Retrain the naive Bayes model using only two of the features.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

trainer = NaiveBayesTrainBatchOp()\
    .setFeatureCols(["odor", "gill_color"])\
    .setCategoricalCols(["odor", "gill_color"])\
    .setLabelCol(LABEL_COL_NAME)

predictor = NaiveBayesPredictBatchOp()\
    .setPredictionCol(PREDICTION_COL_NAME)\
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)

train_data.link(trainer)
predictor.linkFrom(trainer, test_data)

def print_model_info(naiveBayesModelInfo: NaiveBayesModelInfo):
    for feature in ["odor", "gill_color"]:
        print("feature: " + feature)
        print(naiveBayesModelInfo.getCategoryFeatureInfo().get(feature))

trainer.lazyCollectModelInfo(print_model_info)

predictor\
    .lazyPrint(10, "< Prediction >")\
    .link(
        EvalBinaryClassBatchOp()\
            .setPositiveLabelValueString("p")\
            .setLabelCol(LABEL_COL_NAME)\
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
            .lazyPrintMetrics()
    )

BatchOperator.execute()
#c_3_1
# The classic "weather / play tennis" data set, used to demonstrate a C4.5 tree.
df = pd.DataFrame(
    [
        ["sunny", 85.0, 85.0, False, "no"],
        ["sunny", 80.0, 90.0, True, "no"],
        ["overcast", 83.0, 78.0, False, "yes"],
        ["rainy", 70.0, 96.0, False, "yes"],
        ["rainy", 68.0, 80.0, False, "yes"],
        ["rainy", 65.0, 70.0, True, "no"],
        ["overcast", 64.0, 65.0, True, "yes"],
        ["sunny", 72.0, 95.0, False, "no"],
        ["sunny", 69.0, 70.0, False, "yes"],
        ["rainy", 75.0, 80.0, False, "yes"],
        ["sunny", 75.0, 70.0, True, "yes"],
        ["overcast", 72.0, 90.0, True, "yes"],
        ["overcast", 81.0, 75.0, False, "yes"],
        ["rainy", 71.0, 80.0, True, "no"]
    ]
)
source = BatchOperator.fromDataframe(
    df,
    schemaStr="Outlook string, Temperature double, Humidity double, Windy boolean, Play string"
)

source.lazyPrint(-1)

# Train a C4.5 decision tree, print its structure, and save it as an image.
source\
    .link(
        C45TrainBatchOp()\
            .setFeatureCols(["Outlook", "Temperature", "Humidity", "Windy"])\
            .setCategoricalCols(["Outlook", "Windy"])\
            .setLabelCol("Play")\
            .lazyPrintModelInfo()\
            .lazyCollectModelInfo(
                lambda decisionTreeModelInfo:
                    decisionTreeModelInfo.saveTreeAsImage(
                        DATA_DIR + "weather_tree_model.png", True)
            )
    )

BatchOperator.execute()
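# Section 9.2.1 defines the split criteria used by C4.5 and its relatives. As a
# companion to the tree trained above, this standalone sketch computes the label
# entropy of the weather data and the information gain of the "Outlook" split by
# hand. It works on the df defined above (whose columns are unnamed, so they are
# addressed by position) and does not use Alink; entropy() is a local helper.
from collections import Counter
from math import log2

def entropy(labels) -> float:
    counts = Counter(labels)
    total = sum(counts.values())
    return -sum(c / total * log2(c / total) for c in counts.values())

play = list(df[4])      # the "Play" column: 9 "yes" vs 5 "no"
outlook = list(df[0])   # the "Outlook" column

base_entropy = entropy(play)

# Weighted entropy of the children after splitting on Outlook, then the gain.
split_entropy = 0.0
for value in set(outlook):
    subset = [p for o, p in zip(outlook, play) if o == value]
    split_entropy += len(subset) / len(play) * entropy(subset)

print("H(Play)            = %.3f" % base_entropy)                     # about 0.940
print("Gain(Play|Outlook) = %.3f" % (base_entropy - split_entropy))   # about 0.247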
#c_3_2
# Train decision trees on the mushroom data with three different split criteria
# and compare their evaluation metrics.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

for treeType in ['GINI', 'INFOGAIN', 'INFOGAINRATIO']:
    model = train_data.link(
        DecisionTreeTrainBatchOp()\
            .setTreeType(treeType)\
            .setFeatureCols(FEATURE_COL_NAMES)\
            .setCategoricalCols(FEATURE_COL_NAMES)\
            .setLabelCol(LABEL_COL_NAME)\
            .lazyPrintModelInfo("< " + treeType + " >")\
            .lazyCollectModelInfo(
                # Bind treeType as a default argument: the callback only runs at
                # BatchOperator.execute(), after the loop has finished.
                lambda decisionTreeModelInfo, treeType=treeType:
                    decisionTreeModelInfo.saveTreeAsImage(
                        DATA_DIR + "tree_" + treeType + ".jpg", True)
            )
    )

    predictor = DecisionTreePredictBatchOp()\
        .setPredictionCol(PREDICTION_COL_NAME)\
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
    predictor.linkFrom(model, test_data)

    predictor.link(
        EvalBinaryClassBatchOp()\
            .setPositiveLabelValueString("p")\
            .setLabelCol(LABEL_COL_NAME)\
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
            .lazyPrintMetrics("< " + treeType + " >")
    )

BatchOperator.execute()
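# The three treeType values above correspond to the criteria of section 9.2.1:
# Gini impurity (CART), information gain (ID3), and information gain ratio
# (C4.5). Reusing the entropy() helper and the play/outlook lists from the
# weather sketch above, this standalone snippet shows how Gini impurity and the
# gain ratio of a categorical split can be computed by hand; it illustrates the
# formulas and is not Alink's internal implementation.
from collections import Counter

def gini(labels) -> float:
    counts = Counter(labels)
    total = sum(counts.values())
    return 1.0 - sum((c / total) ** 2 for c in counts.values())

def gain_ratio(feature_values, labels) -> float:
    base = entropy(labels)
    split_entropy = 0.0
    for value in set(feature_values):
        subset = [l for f, l in zip(feature_values, labels) if f == value]
        split_entropy += len(subset) / len(labels) * entropy(subset)
    # Normalize the gain by the "split information", i.e. the entropy of the
    # split variable itself; this penalizes splits with many small branches.
    split_info = entropy(feature_values)
    return (base - split_entropy) / split_info if split_info > 0 else 0.0

print("Gini(Play)              = %.3f" % gini(play))                  # about 0.459
print("GainRatio(Play|Outlook) = %.3f" % gain_ratio(outlook, play))   # about 0.156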