This chapter covers the following sections:
8.1 Basics of Linear Models
8.1.1 Loss Functions
8.1.2 Empirical Risk and Structural Risk
8.1.3 Linear Models and Loss Functions
8.1.4 Logistic Regression and Linear SVM
8.2 Evaluation Methods for Binary Classification
8.2.1 Basic Metrics
8.2.2 Composite Metrics
8.2.3 Evaluation Curves
8.3 Data Exploration
8.3.1 Basic Statistics
8.3.2 Correlation
8.4 Training and Test Sets
8.5 Logistic Regression Model
8.6 Linear SVM Model
8.7 Model Evaluation
8.8 Polynomial Expansion of Features
8.9 Factorization Machines
For the detailed discussion, please refer to the print book 《Alink权威指南:基于Flink的机器学习实例入门(Python)》. This page contains only the chapter's companion example code.
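As a quick reference for sections 8.1.3 and 8.1.4, the two linear models used in this chapter differ only in their per-sample loss. In the standard textbook form (not necessarily the notation used in the book), with $f(x) = w^{\top}x + b$ and labels $y \in \{-1, +1\}$:

$$\ell_{\text{logistic}}(y, f(x)) = \log\bigl(1 + e^{-y\,f(x)}\bigr), \qquad \ell_{\text{hinge}}(y, f(x)) = \max\bigl(0,\; 1 - y\,f(x)\bigr).$$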
from pyalink.alink import *
useLocalEnv(1)

from utils import *
import os
import pandas as pd

pd.set_option('display.max_colwidth', 1000)

DATA_DIR = ROOT_DIR + "banknote" + os.sep

ORIGIN_FILE = "data_banknote_authentication.txt"
SCHEMA_STRING = "variance double, skewness double, kurtosis double, entropy double, class int"

TRAIN_FILE = "train.ak"
TEST_FILE = "test.ak"
LR_PRED_FILE = "lr_pred.ak"
SVM_PRED_FILE = "svm_pred.ak"

FEATURE_COL_NAMES = ["variance", "skewness", "kurtosis", "entropy"]
LABEL_COL_NAME = "class"
VEC_COL_NAME = "vec"
PREDICTION_COL_NAME = "pred"
PRED_DETAIL_COL_NAME = "predinfo"
#c_1
source = CsvSourceBatchOp()\
    .setFilePath(DATA_DIR + ORIGIN_FILE)\
    .setSchemaStr(SCHEMA_STRING)

print("column names of source:")
print(source.getColNames())

print("column types of source:")
print(source.getColTypes())

source.firstN(5).print()
#c_1_1
source = CsvSourceBatchOp()\
    .setFilePath(DATA_DIR + ORIGIN_FILE)\
    .setSchemaStr(SCHEMA_STRING)

summary = SummarizerBatchOp().linkFrom(source).collectSummary()

print("Count of data set : " + str(summary.count()))
print("Max value of entropy : " + str(summary.max("entropy")))
print(summary)

source.link(
    SummarizerBatchOp()\
        .lazyCollectSummary(
            lambda tableSummary: (
                print("Count of data set : " + str(tableSummary.count())),
                print("Max value of entropy : " + str(tableSummary.max("entropy"))),
                print(tableSummary)
            )
        )
)

source.link(
    SummarizerBatchOp().lazyPrintSummary()
)

source\
    .lazyPrintStatistics("<- origin data ->")\
    .firstN(5)\
    .lazyPrintStatistics("<- first 5 data ->")\
    .print()
#c_1_2
source = CsvSourceBatchOp()\
    .setFilePath(DATA_DIR + ORIGIN_FILE)\
    .setSchemaStr(SCHEMA_STRING)

correlation = CorrelationBatchOp().linkFrom(source).collectCorrelation()

colNames = correlation.getColNames()
print("Correlation of " + colNames[0] + " with " + colNames[1]
      + " is " + str(correlation.getCorrelation()[0][1]))
print(correlation.getCorrelationMatrix())

def print_correlation_info(correlationResult: CorrelationResult):
    colNames = correlationResult.getColNames()
    print("Correlation of " + colNames[0] + " with " + colNames[1]
          + " is " + str(correlationResult.getCorrelation()[0][1]))
    print(correlationResult.getCorrelationMatrix())

source\
    .link(
        CorrelationBatchOp()\
            .lazyCollectCorrelation(print_correlation_info)
    )

source.link(
    CorrelationBatchOp().lazyPrintCorrelation("< Pearson Correlation >")
)

source.link(
    CorrelationBatchOp()\
        .setMethod("SPEARMAN")\
        .lazyPrintCorrelation("< Spearman Correlation >")
)

BatchOperator.execute()
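As an optional local cross-check (a sketch, not part of the book's code), the same correlation matrices can be recomputed with pandas after pulling the data to the client, and compared with the Alink results printed above.

# Cross-check with pandas; reuses the `source` operator defined above.
df_check = source.collectToDataframe()
print(df_check.corr(method='pearson'))
print(df_check.corr(method='spearman'))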
import matplotlib.pyplot as plt
import seaborn as sns

df_banknote = source.collectToDataframe()

sns.pairplot(df_banknote, vars=df_banknote.columns[:-1], hue='class')
plt.show()
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, learning_rate=100).fit_transform(df_banknote.iloc[:, 0:4])

plt.scatter(tsne[:, 0], tsne[:, 1], c=df_banknote.iloc[:, 4])
plt.colorbar()
plt.show()
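t-SNE is stochastic, so the embedding changes from run to run. A deterministic alternative view (a sketch, not from the book) is a projection onto the first two principal components with scikit-learn's PCA, reusing the df_banknote DataFrame collected above.

from sklearn.decomposition import PCA

# Project the four numeric features onto the first two principal components.
pca_xy = PCA(n_components=2).fit_transform(df_banknote.iloc[:, 0:4])
plt.scatter(pca_xy[:, 0], pca_xy[:, 1], c=df_banknote.iloc[:, 4])
plt.colorbar()
plt.show()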
#c_2
source = CsvSourceBatchOp()\
    .setFilePath(DATA_DIR + ORIGIN_FILE)\
    .setSchemaStr(SCHEMA_STRING)

splitTrainTestIfNotExist(source, DATA_DIR + TRAIN_FILE, DATA_DIR + TEST_FILE, 0.8)
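splitTrainTestIfNotExist comes from the utils module imported above; its implementation is not shown here. A minimal, hypothetical equivalent using Alink's SplitBatchOp might look like the sketch below, assuming the main output carries the 80% training portion and side output 0 the remaining 20%.

# Hypothetical sketch of a train/test split; not the book's utils implementation.
if not os.path.exists(DATA_DIR + TRAIN_FILE):
    splitter = SplitBatchOp().setFraction(0.8).linkFrom(source)
    splitter.link(
        AkSinkBatchOp().setFilePath(DATA_DIR + TRAIN_FILE).setOverwriteSink(True)
    )
    splitter.getSideOutput(0).link(
        AkSinkBatchOp().setFilePath(DATA_DIR + TEST_FILE).setOverwriteSink(True)
    )
    BatchOperator.execute()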
#c_3
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

lrTrainer = LogisticRegressionTrainBatchOp()\
    .setFeatureCols(FEATURE_COL_NAMES)\
    .setLabelCol(LABEL_COL_NAME)

lrPredictor = LogisticRegressionPredictBatchOp()\
    .setPredictionCol(PREDICTION_COL_NAME)\
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)

train_data.link(lrTrainer)
lrPredictor.linkFrom(lrTrainer, test_data)

lrTrainer.lazyPrintTrainInfo().lazyPrintModelInfo()

lrPredictor\
    .lazyPrint(5, "< Prediction >")\
    .link(
        AkSinkBatchOp()\
            .setFilePath(DATA_DIR + LR_PRED_FILE)\
            .setOverwriteSink(True)
    )

BatchOperator.execute()
#c_4
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

svmTrainer = LinearSvmTrainBatchOp()\
    .setFeatureCols(FEATURE_COL_NAMES)\
    .setLabelCol(LABEL_COL_NAME)

svmPredictor = LinearSvmPredictBatchOp()\
    .setPredictionCol(PREDICTION_COL_NAME)\
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)

train_data.link(svmTrainer)
svmPredictor.linkFrom(svmTrainer, test_data)

svmTrainer.lazyPrintTrainInfo().lazyPrintModelInfo()

svmPredictor\
    .lazyPrint(5, "< Prediction >")\
    .link(
        AkSinkBatchOp()\
            .setFilePath(DATA_DIR + SVM_PRED_FILE)\
            .setOverwriteSink(True)
    )

BatchOperator.execute()
#c_5
lr_metrics = EvalBinaryClassBatchOp()\
    .setPositiveLabelValueString("1")\
    .setLabelCol(LABEL_COL_NAME)\
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
    .linkFrom(
        AkSourceBatchOp().setFilePath(DATA_DIR + LR_PRED_FILE)
    )\
    .collectMetrics()

print("< LR >")
print("AUC : " + str(lr_metrics.getAuc())
      + "\t Accuracy : " + str(lr_metrics.getAccuracy())
      + "\t Precision : " + str(lr_metrics.getPrecision())
      + "\t Recall : " + str(lr_metrics.getRecall())
      )
print(lr_metrics)

lr_metrics.saveRocCurveAsImage(DATA_DIR + "lr_roc.jpg", True)
lr_metrics.saveRecallPrecisionCurveAsImage(DATA_DIR + "lr_recallprec.jpg", True)
lr_metrics.saveLiftChartAsImage(DATA_DIR + "lr_lift.jpg", True)
lr_metrics.saveKSAsImage(DATA_DIR + "lr_ks.jpg", True)

AkSourceBatchOp()\
    .setFilePath(DATA_DIR + SVM_PRED_FILE)\
    .link(
        EvalBinaryClassBatchOp()\
            .setPositiveLabelValueString("1")\
            .setLabelCol(LABEL_COL_NAME)\
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
            .lazyPrintMetrics()\
            .lazyCollectMetrics(
                lambda binaryClassMetrics: (
                    binaryClassMetrics.saveRocCurveAsImage(
                        DATA_DIR + "svm_roc.jpg", True),
                    binaryClassMetrics.saveRecallPrecisionCurveAsImage(
                        DATA_DIR + "svm_recallprec.jpg", True),
                    binaryClassMetrics.saveLiftChartAsImage(
                        DATA_DIR + "svm_lift.jpg", True),
                    binaryClassMetrics.saveKSAsImage(
                        DATA_DIR + "svm_ks.jpg", True)
                )
            )
    )

BatchOperator.execute()
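The metrics above can also be cross-checked locally (a sketch, assuming scikit-learn is installed alongside pyalink): pull the logistic regression predictions to a pandas DataFrame and recompute accuracy and the confusion matrix from the hard predictions.

from sklearn.metrics import accuracy_score, confusion_matrix

# Cross-check of the LR predictions; uses only the hard prediction column.
df_lr = AkSourceBatchOp().setFilePath(DATA_DIR + LR_PRED_FILE).collectToDataframe()
print("accuracy :", accuracy_score(df_lr[LABEL_COL_NAME], df_lr[PREDICTION_COL_NAME]))
print(confusion_matrix(df_lr[LABEL_COL_NAME], df_lr[PREDICTION_COL_NAME]))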
#c_6
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

featureExpand = Pipeline()\
    .add(
        VectorAssembler()\
            .setSelectedCols(FEATURE_COL_NAMES)\
            .setOutputCol(VEC_COL_NAME + "_0")
    )\
    .add(
        VectorPolynomialExpand()\
            .setSelectedCol(VEC_COL_NAME + "_0")\
            .setOutputCol(VEC_COL_NAME)\
            .setDegree(2)
    )\
    .fit(train_data)

train_data = featureExpand.transform(train_data)
test_data = featureExpand.transform(test_data)

train_data.lazyPrint(1)

LinearSvm()\
    .setVectorCol(VEC_COL_NAME)\
    .setLabelCol(LABEL_COL_NAME)\
    .setPredictionCol(PREDICTION_COL_NAME)\
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
    .fit(train_data)\
    .transform(test_data)\
    .link(
        EvalBinaryClassBatchOp()\
            .setPositiveLabelValueString("1")\
            .setLabelCol(LABEL_COL_NAME)\
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
            .lazyPrintMetrics("LinearSVM")
    )

LogisticRegression()\
    .setVectorCol(VEC_COL_NAME)\
    .setLabelCol(LABEL_COL_NAME)\
    .setPredictionCol(PREDICTION_COL_NAME)\
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
    .fit(train_data)\
    .transform(test_data)\
    .link(
        EvalBinaryClassBatchOp()\
            .setPositiveLabelValueString("1")\
            .setLabelCol(LABEL_COL_NAME)\
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
            .lazyPrintMetrics("LogisticRegression")
    )

LogisticRegression()\
    .setOptimMethod("Newton")\
    .setVectorCol(VEC_COL_NAME)\
    .setLabelCol(LABEL_COL_NAME)\
    .setPredictionCol(PREDICTION_COL_NAME)\
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
    .fit(train_data)\
    .transform(test_data)\
    .link(
        EvalBinaryClassBatchOp()\
            .setPositiveLabelValueString("1")\
            .setLabelCol(LABEL_COL_NAME)\
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
            .lazyPrintMetrics("LogisticRegression + OptimMethod.Newton")
    )

BatchOperator.execute()
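For intuition only (a sketch using scikit-learn's PolynomialFeatures, not Alink's VectorPolynomialExpand, so the exact term ordering may differ): the snippet below lists which degree-2 terms a 4-dimensional feature vector expands into.

from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Enumerate the degree-2 terms generated from the four banknote features.
poly = PolynomialFeatures(degree=2)
poly.fit(np.zeros((1, 4)))
print(poly.get_feature_names_out(FEATURE_COL_NAMES))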
#c_7
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

FmClassifier()\
    .setNumEpochs(10)\
    .setLearnRate(0.5)\
    .setNumFactor(2)\
    .setFeatureCols(FEATURE_COL_NAMES)\
    .setLabelCol(LABEL_COL_NAME)\
    .setPredictionCol(PREDICTION_COL_NAME)\
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
    .enableLazyPrintTrainInfo()\
    .enableLazyPrintModelInfo()\
    .fit(train_data)\
    .transform(test_data)\
    .link(
        EvalBinaryClassBatchOp()\
            .setPositiveLabelValueString("1")\
            .setLabelCol(LABEL_COL_NAME)\
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
            .lazyPrintMetrics("FM")
    )

BatchOperator.execute()
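For reference, a second-order factorization machine scores a sample as below (standard form, not taken from the book); setNumFactor(2) above fixes the latent dimension $k$ of the factor vectors $\mathbf{v}_i$ at 2:

$$\hat{y}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle \mathbf{v}_i, \mathbf{v}_j \rangle\, x_i x_j, \qquad \mathbf{v}_i \in \mathbb{R}^{k}.$$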