本章包括下面各节：
15.1 平均数
15.2 向平均数方向的“回归”
15.3 线性回归
详细内容请阅读纸质书《Alink权威指南：基于Flink的机器学习实例入门（Python）》，这里为本章对应的示例代码。
# Environment setup for the chapter's examples.
# The statements below were fused onto a single line; they are restored to
# one statement per line, in their original order.
from pyalink.alink import *

# Start a local PyAlink environment; the argument is presumably the
# parallelism (see the PyAlink `useLocalEnv` documentation).
useLocalEnv(1)

from utils import *
import os
import pandas as pd

# Directory and file name of the father/son height data set.
# ROOT_DIR is not defined here; presumably it comes from `utils`.
DATA_DIR = ROOT_DIR + "father_son" + os.sep
ORIGIN_FILE = "Pearson.txt"
# Batch source over the tab-delimited data file: two double columns
# (father, son), with the first line of the file ignored.
source = (
    CsvSourceBatchOp()
    .setFilePath(DATA_DIR + ORIGIN_FILE)
    .setSchemaStr("father double, son double")
    .setFieldDelimiter("\t")
    .setIgnoreFirstLine(True)
)

# Show the first five rows of the source.
source.firstN(5).print()
# The statements below were fused onto a single line; they are restored to
# one statement per line, in their original order.
import matplotlib.pyplot as plt

# Collect the full data set into a DataFrame so it can be plotted.
df_source = source.collectToDataframe()

# Scatter plot of the `son` column against the `father` column.
plt.figure(figsize=(8, 8))
plt.scatter(df_source['father'], df_source['son'], color='blue', s=2)
plt.show()
# Register a statistics printout for the whole data set, then trigger
# execution of the pending batch job(s).
source.lazyPrintStatistics()
BatchOperator.execute()
# Derive a reference column `plus_one` = father + 1 and collect the result
# into a DataFrame.
df_plus_one = (
    source
    .select("father, son, father+1 AS plus_one")
    .collectToDataframe()
)

# Scatter of the data (blue) with the father + 1 reference line (grey).
plt.figure(figsize=(8, 8))
plt.scatter(df_source['father'], df_source['son'], color='blue', s=2)
plt.plot(
    df_plus_one['father'],
    df_plus_one['plus_one'],
    color='grey',
    linewidth=2,
)
plt.show()
# Statistics for the rows whose `father` value lies in [71.5, 72.5).
(
    source
    .filter("father>=71.5 AND father<72.5")
    .lazyPrintStatistics("father 72")
)

# Statistics for the rows whose `father` value lies in [64.5, 65.5).
(
    source
    .filter("father>=64.5 AND father<65.5")
    .lazyPrintStatistics("father 65")
)

# Trigger execution so the registered lazy printouts are produced.
BatchOperator.execute()
# Train a linear regression with `father` as the only feature and `son`
# as the label.
linear_model = (
    LinearRegTrainBatchOp()
    .setFeatureCols(["father"])
    .setLabelCol("son")
    .linkFrom(source)
)

# Register printouts of the training info and the model info.
linear_model.lazyPrintTrainInfo()
linear_model.lazyPrintModelInfo()

# Apply the trained model to the same data; predictions are written to the
# `linear_reg` column.
linear_reg = (
    LinearRegPredictBatchOp()
    .setPredictionCol("linear_reg")
    .linkFrom(linear_model, source)
)

# Register a printout of 5 prediction rows, then trigger execution.
linear_reg.lazyPrint(5)
BatchOperator.execute()
# The statements below were fused onto a single line; they are restored to
# one statement per line, in their original order.

# Collect the predictions into a DataFrame for plotting.
df_linear_reg = linear_reg.collectToDataframe()

# Scatter of the data (blue), the father + 1 reference line (grey) and the
# fitted regression line (red).
plt.figure(figsize=(8, 8))
plt.scatter(df_source['father'], df_source['son'], color='blue', s=2)
plt.plot(df_plus_one['father'], df_plus_one['plus_one'], color='grey', linewidth=2)
plt.plot(df_linear_reg['father'], df_linear_reg['linear_reg'], color='red', linewidth=2)
plt.show()
```python
```