本章学习目标

通过本章学习,你将掌握:

- 机器学习基础概念和Spark MLlib框架
- 数据预处理和特征工程技术
- 分类、回归、聚类等核心算法
- 推荐系统的设计和实现
- 模型评估和超参数调优
- 机器学习项目的完整开发流程


5.1 机器学习基础

5.1.1 机器学习概述

机器学习是人工智能的一个重要分支,通过算法让计算机从数据中学习模式,并对新数据进行预测或决策。按照是否使用带标签的数据,常见任务可分为监督学习、无监督学习和强化学习三大类;下面的示例代码分别演示了分类、回归和聚类任务。

机器学习类型

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.regression import LinearRegression
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Any

class MachineLearningOverview:
    """
    机器学习概述和基础概念演示
    """
    
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("MachineLearningOverview") \
            .config("spark.sql.adaptive.enabled", "true") \
            .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
            .getOrCreate()
        
        # 设置日志级别
        self.spark.sparkContext.setLogLevel("WARN")
    
    def demonstrate_ml_types(self):
        """
        演示不同类型的机器学习任务
        """
        print("=== 机器学习类型演示 ===")
        
        # 1. 监督学习 - 分类任务
        print("\n1. 监督学习 - 分类任务")
        classification_data = self.spark.createDataFrame([
            (1.0, 2.0, 1.0),  # 特征1, 特征2, 标签
            (2.0, 3.0, 1.0),
            (3.0, 1.0, 0.0),
            (4.0, 2.0, 0.0),
            (5.0, 4.0, 1.0),
            (6.0, 1.0, 0.0)
        ], ["feature1", "feature2", "label"])
        
        # 特征向量化
        assembler = VectorAssembler(
            inputCols=["feature1", "feature2"],
            outputCol="features"
        )
        
        # 逻辑回归分类器
        lr = LogisticRegression(
            featuresCol="features",
            labelCol="label",
            maxIter=10
        )
        
        # 创建管道
        pipeline = Pipeline(stages=[assembler, lr])
        model = pipeline.fit(classification_data)
        
        # 预测
        predictions = model.transform(classification_data)
        predictions.select("features", "label", "prediction", "probability").show()
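        
        # 补充示例:PipelineModel.stages 保存了训练后的各个阶段,
        # 最后一个阶段即训练好的 LogisticRegressionModel,可以直接查看其参数
        trained_lr = model.stages[-1]
        print(f"逻辑回归系数: {trained_lr.coefficients}, 截距: {trained_lr.intercept:.4f}")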
        
        # 2. 监督学习 - 回归任务
        print("\n2. 监督学习 - 回归任务")
        regression_data = self.spark.createDataFrame([
            (1.0, 2.0, 3.5),  # 特征1, 特征2, 目标值
            (2.0, 3.0, 5.2),
            (3.0, 1.0, 4.1),
            (4.0, 2.0, 6.3),
            (5.0, 4.0, 8.1),
            (6.0, 1.0, 5.9)
        ], ["feature1", "feature2", "target"])
        
        # 线性回归
        lr_reg = LinearRegression(
            featuresCol="features",
            labelCol="target",
            maxIter=10
        )
        
        pipeline_reg = Pipeline(stages=[assembler, lr_reg])
        model_reg = pipeline_reg.fit(regression_data)
        
        predictions_reg = model_reg.transform(regression_data)
        predictions_reg.select("features", "target", "prediction").show()
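        
        # 补充示例:用 RegressionEvaluator 计算回归的均方根误差(RMSE,越小越好)
        reg_evaluator = RegressionEvaluator(
            labelCol="target", predictionCol="prediction", metricName="rmse"
        )
        print(f"回归模型RMSE: {reg_evaluator.evaluate(predictions_reg):.4f}")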
        
        # 3. 无监督学习 - 聚类任务
        print("\n3. 无监督学习 - 聚类任务")
        clustering_data = self.spark.createDataFrame([
            (1.0, 2.0),
            (2.0, 3.0),
            (8.0, 9.0),
            (9.0, 8.0),
            (1.5, 2.5),
            (8.5, 9.5)
        ], ["feature1", "feature2"])
        
        # K-means聚类
        kmeans = KMeans(
            featuresCol="features",
            predictionCol="cluster",
            k=2,
            seed=42
        )
        
        pipeline_cluster = Pipeline(stages=[assembler, kmeans])
        model_cluster = pipeline_cluster.fit(clustering_data)
        
        predictions_cluster = model_cluster.transform(clustering_data)
        predictions_cluster.select("features", "cluster").show()
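        
        # 补充示例:聚类没有标签,常用轮廓系数(silhouette)评估聚类效果
        from pyspark.ml.evaluation import ClusteringEvaluator
        cluster_evaluator = ClusteringEvaluator(
            featuresCol="features", predictionCol="cluster", metricName="silhouette"
        )
        print(f"轮廓系数: {cluster_evaluator.evaluate(predictions_cluster):.4f}")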
        
        return {
            'classification': predictions.toPandas(),
            'regression': predictions_reg.toPandas(),
            'clustering': predictions_cluster.toPandas()
        }
    
    def demonstrate_ml_workflow(self):
        """
        演示机器学习工作流程
        """
        print("\n=== 机器学习工作流程演示 ===")
        
        # 1. 数据收集和加载
        print("\n1. 数据收集和加载")
        # 创建示例数据集
        data = self.spark.createDataFrame([
            ("Alice", 25, "Engineer", 75000, 1),
            ("Bob", 30, "Manager", 85000, 1),
            ("Charlie", 35, "Director", 95000, 1),
            ("David", 28, "Engineer", 70000, 0),
            ("Eve", 32, "Manager", 80000, 1),
            ("Frank", 45, "Director", 100000, 1),
            ("Grace", 26, "Engineer", 72000, 0),
            ("Henry", 38, "Manager", 88000, 1)
        ], ["name", "age", "position", "salary", "promotion"])
        
        print("原始数据:")
        data.show()
        
        # 2. 数据探索和分析
        print("\n2. 数据探索和分析")
        print("数据统计信息:")
        data.describe().show()
        
        print("职位分布:")
        data.groupBy("position").count().show()
        
        # 3. 特征工程
        print("\n3. 特征工程")
        
        # 字符串索引化
        position_indexer = StringIndexer(
            inputCol="position",
            outputCol="position_index"
        )
        
        # 特征向量化
        feature_assembler = VectorAssembler(
            inputCols=["age", "salary", "position_index"],
            outputCol="features"
        )
        
        # 特征标准化
        scaler = StandardScaler(
            inputCol="features",
            outputCol="scaled_features",
            withStd=True,
            withMean=True
        )
        
        # 4. 模型训练
        print("\n4. 模型训练")
        classifier = LogisticRegression(
            featuresCol="scaled_features",
            labelCol="promotion",
            maxIter=100
        )
        
        # 创建完整的机器学习管道
        ml_pipeline = Pipeline(stages=[
            position_indexer,
            feature_assembler,
            scaler,
            classifier
        ])
        
        # 训练模型
        model = ml_pipeline.fit(data)
        
        # 5. 模型评估
        print("\n5. 模型评估")
        predictions = model.transform(data)
        predictions.select(
            "name", "age", "position", "salary", 
            "promotion", "prediction", "probability"
        ).show()
        
        # 计算准确率
        evaluator = BinaryClassificationEvaluator(
            labelCol="promotion",
            rawPredictionCol="rawPrediction",
            metricName="areaUnderROC"
        )
        
        auc = evaluator.evaluate(predictions)
        print(f"模型AUC: {auc:.4f}")
        
        # 6. 模型部署和预测
        print("\n6. 模型部署和预测")
        new_data = self.spark.createDataFrame([
            ("John", 29, "Engineer", 73000),
            ("Jane", 33, "Manager", 82000)
        ], ["name", "age", "position", "salary"])
        
        new_predictions = model.transform(new_data)
        new_predictions.select(
            "name", "age", "position", "salary", 
            "prediction", "probability"
        ).show()
        
        return {
            'model': model,
            'predictions': predictions.toPandas(),
            'auc': auc
        }
    
    def compare_ml_algorithms(self):
        """
        比较不同机器学习算法
        """
        print("\n=== 机器学习算法比较 ===")
        
        # 创建更大的数据集
        np.random.seed(42)
        n_samples = 1000
        
        # 生成分类数据
        X1 = np.random.normal(2, 1, n_samples//2)
        X2 = np.random.normal(1, 1, n_samples//2)
        y1 = np.ones(n_samples//2)
        
        X3 = np.random.normal(-1, 1, n_samples//2)
        X4 = np.random.normal(-2, 1, n_samples//2)
        y2 = np.zeros(n_samples//2)
        
        # 合并数据(转换为 Python 原生 float,避免 Spark 无法推断 numpy 类型的 schema)
        features = [
            (float(f1), float(f2), float(y))
            for f1, f2, y in zip(
                np.concatenate([X1, X3]),
                np.concatenate([X2, X4]),
                np.concatenate([y1, y2])
            )
        ]
        
        data = self.spark.createDataFrame(
            features, ["feature1", "feature2", "label"]
        )
        
        # 特征向量化
        assembler = VectorAssembler(
            inputCols=["feature1", "feature2"],
            outputCol="features"
        )
        
        data_vectorized = assembler.transform(data)
        
        # 数据分割
        train_data, test_data = data_vectorized.randomSplit([0.8, 0.2], seed=42)
        
        # 算法比较
        algorithms = {
            'LogisticRegression': LogisticRegression(
                featuresCol="features",
                labelCol="label",
                maxIter=100
            ),
            'DecisionTree': DecisionTreeClassifier(
                featuresCol="features",
                labelCol="label",
                maxDepth=5
            )
        }
        
        results = {}
        evaluator = BinaryClassificationEvaluator(
            labelCol="label",
            rawPredictionCol="rawPrediction",
            metricName="areaUnderROC"
        )
        
        for name, algorithm in algorithms.items():
            print(f"\n训练 {name}...")
            
            # 训练模型
            model = algorithm.fit(train_data)
            
            # 预测
            predictions = model.transform(test_data)
            
            # 评估
            auc = evaluator.evaluate(predictions)
            
            results[name] = {
                'model': model,
                'auc': auc,
                'predictions': predictions
            }
            
            print(f"{name} AUC: {auc:.4f}")
        
        return results
    
    def visualize_ml_concepts(self, results: Dict[str, Any]):
        """
        可视化机器学习概念
        """
        print("\n=== 机器学习概念可视化 ===")
        
        # 设置中文字体,避免图中的中文标签显示异常
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('机器学习概念可视化', fontsize=16, fontweight='bold')
        
        # 1. 分类结果可视化
        if 'classification' in results:
            df_class = results['classification']
            ax = axes[0, 0]
            
            # 提取特征
            features = np.array([list(row) for row in df_class['features']])
            
            scatter = ax.scatter(
                features[:, 0], features[:, 1],
                c=df_class['prediction'],
                cmap='viridis',
                alpha=0.7
            )
            ax.set_title('分类结果')
            ax.set_xlabel('特征1')
            ax.set_ylabel('特征2')
            plt.colorbar(scatter, ax=ax)
        
        # 2. 回归结果可视化
        if 'regression' in results:
            df_reg = results['regression']
            ax = axes[0, 1]
            
            ax.scatter(df_reg['target'], df_reg['prediction'], alpha=0.7)
            ax.plot([df_reg['target'].min(), df_reg['target'].max()],
                   [df_reg['target'].min(), df_reg['target'].max()],
                   'r--', lw=2)
            ax.set_title('回归预测 vs 实际值')
            ax.set_xlabel('实际值')
            ax.set_ylabel('预测值')
        
        # 3. 聚类结果可视化
        if 'clustering' in results:
            df_cluster = results['clustering']
            ax = axes[0, 2]
            
            features = np.array([list(row) for row in df_cluster['features']])
            
            scatter = ax.scatter(
                features[:, 0], features[:, 1],
                c=df_cluster['cluster'],
                cmap='tab10',
                alpha=0.7
            )
            ax.set_title('聚类结果')
            ax.set_xlabel('特征1')
            ax.set_ylabel('特征2')
            plt.colorbar(scatter, ax=ax)
        
        # 4. 机器学习工作流程
        ax = axes[1, 0]
        workflow_steps = ['数据收集', '数据预处理', '特征工程', '模型训练', '模型评估', '模型部署']
        y_pos = np.arange(len(workflow_steps))
        
        ax.barh(y_pos, [1]*len(workflow_steps), color='skyblue', alpha=0.7)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(workflow_steps)
        ax.set_title('机器学习工作流程')
        ax.set_xlabel('完成度')
        
        # 5. 算法类型分布
        ax = axes[1, 1]
        ml_types = ['监督学习', '无监督学习', '强化学习']
        sizes = [60, 30, 10]  # 示意性比例,仅用于展示三类学习方式的相对占比
        colors = ['lightcoral', 'lightskyblue', 'lightgreen']
        
        ax.pie(sizes, labels=ml_types, colors=colors, autopct='%1.1f%%', startangle=90)
        ax.set_title('机器学习类型分布')
        
        # 6. 模型性能比较
        ax = axes[1, 2]
        if 'model' in results:
            metrics = ['准确率', 'AUC', '召回率', 'F1分数']
            # 注意:除 AUC 取自上面的评估结果外,其余指标为示例占位值
            values = [0.85, results.get('auc', 0.8), 0.82, 0.83]
            
            bars = ax.bar(metrics, values, color=['red', 'green', 'blue', 'orange'], alpha=0.7)
            ax.set_title('模型性能指标')
            ax.set_ylabel('分数')
            ax.set_ylim(0, 1)
            
            # 添加数值标签
            for bar, value in zip(bars, values):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                       f'{value:.3f}', ha='center', va='bottom')
        
        plt.tight_layout()
        plt.show()
        
        # 机器学习算法决策树
        self._plot_ml_decision_tree()
    
    def _plot_ml_decision_tree(self):
        """
        绘制机器学习算法选择决策树
        """
        fig, ax = plt.subplots(1, 1, figsize=(14, 10))
        
        # 决策树节点
        nodes = {
            'root': (0.5, 0.9, '机器学习任务'),
            'supervised': (0.2, 0.7, '监督学习\n(有标签数据)'),
            'unsupervised': (0.8, 0.7, '无监督学习\n(无标签数据)'),
            'classification': (0.1, 0.5, '分类\n(离散输出)'),
            'regression': (0.3, 0.5, '回归\n(连续输出)'),
            'clustering': (0.7, 0.5, '聚类\n(数据分组)'),
            'dimensionality': (0.9, 0.5, '降维\n(特征减少)'),
            'lr': (0.05, 0.3, '逻辑回归'),
            'dt': (0.15, 0.3, '决策树'),
            'linear': (0.25, 0.3, '线性回归'),
            'rf': (0.35, 0.3, '随机森林'),
            'kmeans': (0.65, 0.3, 'K-means'),
            'dbscan': (0.75, 0.3, 'DBSCAN'),
            'pca': (0.85, 0.3, 'PCA'),
            'tsne': (0.95, 0.3, 't-SNE')
        }
        
        # 绘制节点
        for node, (x, y, text) in nodes.items():
            if node == 'root':
                color = 'lightblue'
            elif node in ['supervised', 'unsupervised']:
                color = 'lightgreen'
            elif node in ['classification', 'regression', 'clustering', 'dimensionality']:
                color = 'lightyellow'
            else:
                color = 'lightcoral'
            
            ax.add_patch(plt.Rectangle((x-0.04, y-0.03), 0.08, 0.06, 
                                     facecolor=color, edgecolor='black', linewidth=1))
            ax.text(x, y, text, ha='center', va='center', fontsize=8, fontweight='bold')
        
        # 绘制连接线
        connections = [
            ('root', 'supervised'),
            ('root', 'unsupervised'),
            ('supervised', 'classification'),
            ('supervised', 'regression'),
            ('unsupervised', 'clustering'),
            ('unsupervised', 'dimensionality'),
            ('classification', 'lr'),
            ('classification', 'dt'),
            ('regression', 'linear'),
            ('regression', 'rf'),
            ('clustering', 'kmeans'),
            ('clustering', 'dbscan'),
            ('dimensionality', 'pca'),
            ('dimensionality', 'tsne')
        ]
        
        for start, end in connections:
            x1, y1, _ = nodes[start]
            x2, y2, _ = nodes[end]
            ax.plot([x1, x2], [y1, y2], 'k-', alpha=0.6, linewidth=1)
        
        ax.set_xlim(-0.05, 1.05)
        ax.set_ylim(0.2, 1.0)
        ax.set_title('机器学习算法选择决策树', fontsize=14, fontweight='bold')
        ax.axis('off')
        
        plt.tight_layout()
        plt.show()
    
    def cleanup(self):
        """
        清理资源
        """
        self.spark.stop()

# 使用示例
if __name__ == "__main__":
    # 创建机器学习概述实例
    ml_overview = MachineLearningOverview()
    
    try:
        # 演示机器学习类型
        ml_types_results = ml_overview.demonstrate_ml_types()
        
        # 演示机器学习工作流程
        workflow_results = ml_overview.demonstrate_ml_workflow()
        
        # 比较机器学习算法
        algorithm_results = ml_overview.compare_ml_algorithms()
        
        # 合并结果
        all_results = {**ml_types_results, **workflow_results}
        
        # 可视化结果
        ml_overview.visualize_ml_concepts(all_results)
        
    finally:
        # 清理资源
        ml_overview.cleanup()
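
在上面的完整工作流中,"模型部署"一步只是直接对新数据调用 transform。实际项目中通常还会把训练好的 PipelineModel 持久化,供线上服务或批处理任务加载。下面是一个最小示意(保存路径 /tmp/promotion_model 为假设值,model 与 new_data 沿用 demonstrate_ml_workflow 中的同名对象):

from pyspark.ml import PipelineModel

# 保存训练好的管道模型(路径可替换为 HDFS、S3 等分布式存储)
model.write().overwrite().save("/tmp/promotion_model")

# 在部署环境中重新加载模型并进行预测
loaded_model = PipelineModel.load("/tmp/promotion_model")
loaded_model.transform(new_data).select("name", "prediction").show()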

5.1.2 Spark MLlib架构

Spark MLlib是Spark的机器学习库,提供了特征工程、分类、回归、聚类、推荐等丰富的算法和工具。本章示例均使用基于DataFrame的pyspark.ml API(基于RDD的旧版pyspark.mllib API已进入维护模式)。

MLlib核心组件

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, Transformer, Estimator
from pyspark.ml.param import Param, Params
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml.regression import *
from pyspark.ml.clustering import *
from pyspark.ml.recommendation import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import *
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from typing import Dict, List, Any

class SparkMLlibArchitecture:
    """
    Spark MLlib架构和核心组件演示
    """
    
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("SparkMLlibArchitecture") \
            .config("spark.sql.adaptive.enabled", "true") \
            .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
            .getOrCreate()
        
        self.spark.sparkContext.setLogLevel("WARN")
    
    def demonstrate_mllib_components(self):
        """
        演示MLlib核心组件
        """
        print("=== Spark MLlib核心组件演示 ===")
        
        # 创建示例数据
        data = self.spark.createDataFrame([
            (0, "Hi I heard about Spark", 1.0),
            (1, "I wish Java could use case classes", 0.0),
            (2, "Logistic regression models are neat", 1.0),
            (3, "Machine learning is awesome", 1.0),
            (4, "Programming is difficult", 0.0),
            (5, "Spark MLlib is powerful", 1.0)
        ], ["id", "text", "label"])
        
        print("\n1. Transformer组件演示")
        
        # 1.1 Tokenizer - 文本分词
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        tokenized_data = tokenizer.transform(data)
        print("分词结果:")
        tokenized_data.select("text", "words").show(truncate=False)
        
        # 1.2 HashingTF - 词频向量化
        hashing_tf = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=20)
        tf_data = hashing_tf.transform(tokenized_data)
        print("\n词频向量化结果:")
        tf_data.select("words", "raw_features").show(truncate=False)
        
        # 1.3 IDF - 逆文档频率
        idf = IDF(inputCol="raw_features", outputCol="features")
        idf_model = idf.fit(tf_data)
        tfidf_data = idf_model.transform(tf_data)
        print("\nTF-IDF结果:")
        tfidf_data.select("raw_features", "features").show(truncate=False)
        
        print("\n2. Estimator组件演示")
        
        # 2.1 逻辑回归估计器
        lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
        
        # 训练模型(Estimator.fit() -> Model)
        lr_model = lr.fit(tfidf_data)
        print(f"逻辑回归模型系数: {lr_model.coefficients}")
        print(f"逻辑回归模型截距: {lr_model.intercept}")
        
        # 2.2 模型预测(Model是特殊的Transformer)
        predictions = lr_model.transform(tfidf_data)
        print("\n预测结果:")
        predictions.select("text", "label", "prediction", "probability").show(truncate=False)
        
        print("\n3. Pipeline组件演示")
        
        # 创建机器学习管道
        pipeline = Pipeline(stages=[
            tokenizer,      # Stage 1: 分词
            hashing_tf,     # Stage 2: 词频向量化
            idf,           # Stage 3: IDF计算
            lr             # Stage 4: 逻辑回归
        ])
        
        # 训练管道
        pipeline_model = pipeline.fit(data)
        
        # 使用管道进行预测
        pipeline_predictions = pipeline_model.transform(data)
        print("\n管道预测结果:")
        pipeline_predictions.select("text", "label", "prediction").show(truncate=False)
        
        return {
            'tokenized_data': tokenized_data.toPandas(),
            'tfidf_data': tfidf_data.toPandas(),
            'predictions': predictions.toPandas(),
            'pipeline_model': pipeline_model
        }
    
    def demonstrate_feature_engineering(self):
        """
        演示特征工程组件
        """
        print("\n=== 特征工程组件演示 ===")
        
        # 创建数值和分类特征数据
        data = self.spark.createDataFrame([
            (0, 18, "student", "single", 1000.0, 1),
            (1, 25, "engineer", "married", 5000.0, 1),
            (2, 35, "manager", "married", 8000.0, 1),
            (3, 22, "student", "single", 1200.0, 0),
            (4, 45, "director", "divorced", 12000.0, 1),
            (5, 28, "engineer", "single", 4500.0, 0)
        ], ["id", "age", "job", "marital", "salary", "approved"])
        
        print("原始数据:")
        data.show()
        
        # 1. 字符串索引化
        print("\n1. 字符串索引化")
        job_indexer = StringIndexer(inputCol="job", outputCol="job_index")
        marital_indexer = StringIndexer(inputCol="marital", outputCol="marital_index")
        
        indexed_data = job_indexer.fit(data).transform(data)
        indexed_data = marital_indexer.fit(indexed_data).transform(indexed_data)
        
        print("索引化结果:")
        indexed_data.select("job", "job_index", "marital", "marital_index").show()
        
        # 2. 独热编码
        print("\n2. 独热编码")
        job_encoder = OneHotEncoder(inputCol="job_index", outputCol="job_vec")
        marital_encoder = OneHotEncoder(inputCol="marital_index", outputCol="marital_vec")
        
        encoded_data = job_encoder.fit(indexed_data).transform(indexed_data)
        encoded_data = marital_encoder.fit(encoded_data).transform(encoded_data)
        
        print("独热编码结果:")
        encoded_data.select("job", "job_vec", "marital", "marital_vec").show(truncate=False)
        
        # 3. 数值特征标准化
        print("\n3. 数值特征标准化")
        
        # 首先组装数值特征
        numeric_assembler = VectorAssembler(
            inputCols=["age", "salary"],
            outputCol="numeric_features"
        )
        
        numeric_data = numeric_assembler.transform(encoded_data)
        
        # 标准化
        scaler = StandardScaler(
            inputCol="numeric_features",
            outputCol="scaled_numeric_features",
            withStd=True,
            withMean=True
        )
        
        scaled_data = scaler.fit(numeric_data).transform(numeric_data)
        
        print("标准化结果:")
        scaled_data.select("numeric_features", "scaled_numeric_features").show(truncate=False)
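        
        # 补充示例:除 StandardScaler 外,也常用 MinMaxScaler 把特征缩放到 [0, 1] 区间
        minmax_scaler = MinMaxScaler(
            inputCol="numeric_features",
            outputCol="minmax_features"
        )
        minmax_data = minmax_scaler.fit(numeric_data).transform(numeric_data)
        minmax_data.select("numeric_features", "minmax_features").show(truncate=False)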
        
        # 4. 特征选择
        print("\n4. 特征选择")
        
        # 组装所有特征
        feature_assembler = VectorAssembler(
            inputCols=["scaled_numeric_features", "job_vec", "marital_vec"],
            outputCol="all_features"
        )
        
        all_features_data = feature_assembler.transform(scaled_data)
        
        # 使用卡方检验进行特征选择
        selector = ChiSqSelector(
            featuresCol="all_features",
            outputCol="selected_features",
            labelCol="approved",
            numTopFeatures=5
        )
        
        selected_data = selector.fit(all_features_data).transform(all_features_data)
        
        print("特征选择结果:")
        selected_data.select("all_features", "selected_features").show(truncate=False)
        
        # 5. 主成分分析(PCA)
        print("\n5. 主成分分析(PCA)")
        pca = PCA(
            inputCol="all_features",
            outputCol="pca_features",
            k=3
        )
        
        pca_data = pca.fit(all_features_data).transform(all_features_data)
        
        print("PCA结果:")
        pca_data.select("all_features", "pca_features").show(truncate=False)
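        
        # 补充示例:单独保留 PCAModel(这里为演示再 fit 一次)可以查看各主成分解释的方差比例
        pca_model = pca.fit(all_features_data)
        print(f"各主成分解释的方差比例: {pca_model.explainedVariance}")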
        
        return {
            'indexed_data': indexed_data.toPandas(),
            'encoded_data': encoded_data.toPandas(),
            'scaled_data': scaled_data.toPandas(),
            'selected_data': selected_data.toPandas(),
            'pca_data': pca_data.toPandas()
        }
    
    def demonstrate_model_evaluation(self):
        """
        演示模型评估组件
        """
        print("\n=== 模型评估组件演示 ===")
        
        # 创建分类数据
        np.random.seed(42)
        n_samples = 1000
        
        # 生成特征
        feature1 = np.random.normal(0, 1, n_samples)
        feature2 = np.random.normal(0, 1, n_samples)
        
        # 生成标签(基于特征的线性组合加噪声)
        labels = (feature1 + feature2 + np.random.normal(0, 0.5, n_samples) > 0).astype(float)
        
        # 创建DataFrame(转换为 Python 原生 float,避免 Spark 无法推断 numpy 类型的 schema)
        data = self.spark.createDataFrame(
            [(float(f1), float(f2), float(y)) for f1, f2, y in zip(feature1, feature2, labels)],
            ["feature1", "feature2", "label"]
        )
        
        # 特征向量化
        assembler = VectorAssembler(
            inputCols=["feature1", "feature2"],
            outputCol="features"
        )
        
        vectorized_data = assembler.transform(data)
        
        # 数据分割
        train_data, test_data = vectorized_data.randomSplit([0.8, 0.2], seed=42)
        
        # 训练多个模型
        models = {}
        predictions = {}
        
        # 1. 逻辑回归
        lr = LogisticRegression(featuresCol="features", labelCol="label")
        lr_model = lr.fit(train_data)
        models['LogisticRegression'] = lr_model
        predictions['LogisticRegression'] = lr_model.transform(test_data)
        
        # 2. 决策树
        from pyspark.ml.classification import DecisionTreeClassifier
        dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
        dt_model = dt.fit(train_data)
        models['DecisionTree'] = dt_model
        predictions['DecisionTree'] = dt_model.transform(test_data)
        
        # 评估模型
        print("\n模型评估结果:")
        
        # 二分类评估器
        binary_evaluator = BinaryClassificationEvaluator(
            labelCol="label",
            rawPredictionCol="rawPrediction"
        )
        
        # 多分类评估器
        multi_evaluator = MulticlassClassificationEvaluator(
            labelCol="label",
            predictionCol="prediction"
        )
        
        evaluation_results = {}
        
        for model_name, pred_data in predictions.items():
            print(f"\n{model_name} 评估结果:")
            
            # AUC
            auc = binary_evaluator.evaluate(pred_data, {binary_evaluator.metricName: "areaUnderROC"})
            print(f"  AUC: {auc:.4f}")
            
            # 准确率
            accuracy = multi_evaluator.evaluate(pred_data, {multi_evaluator.metricName: "accuracy"})
            print(f"  准确率: {accuracy:.4f}")
            
            # 精确率
            precision = multi_evaluator.evaluate(pred_data, {multi_evaluator.metricName: "weightedPrecision"})
            print(f"  精确率: {precision:.4f}")
            
            # 召回率
            recall = multi_evaluator.evaluate(pred_data, {multi_evaluator.metricName: "weightedRecall"})
            print(f"  召回率: {recall:.4f}")
            
            # F1分数
            f1 = multi_evaluator.evaluate(pred_data, {multi_evaluator.metricName: "f1"})
            print(f"  F1分数: {f1:.4f}")
            
            evaluation_results[model_name] = {
                'auc': auc,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1
            }
        
        return {
            'models': models,
            'predictions': predictions,
            'evaluation_results': evaluation_results
        }
    
    def demonstrate_hyperparameter_tuning(self):
        """
        演示超参数调优
        """
        print("\n=== 超参数调优演示 ===")
        
        # 创建数据
        data = self.spark.createDataFrame([
            (1.0, 2.0, 1.0),
            (2.0, 3.0, 1.0),
            (3.0, 1.0, 0.0),
            (4.0, 2.0, 0.0),
            (5.0, 4.0, 1.0),
            (6.0, 1.0, 0.0),
            (7.0, 3.0, 1.0),
            (8.0, 2.0, 0.0)
        ] * 50, ["feature1", "feature2", "label"])  # 扩大数据集
        
        # 特征向量化
        assembler = VectorAssembler(
            inputCols=["feature1", "feature2"],
            outputCol="features"
        )
        
        vectorized_data = assembler.transform(data)
        
        # 数据分割
        train_data, test_data = vectorized_data.randomSplit([0.8, 0.2], seed=42)
        
        # 1. 网格搜索
        print("\n1. 网格搜索超参数调优")
        
        # 创建逻辑回归模型
        lr = LogisticRegression(featuresCol="features", labelCol="label")
        
        # 创建参数网格
        from pyspark.ml.tuning import ParamGridBuilder
        param_grid = ParamGridBuilder() \
            .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
            .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
            .build()
        
        # 创建交叉验证器
        from pyspark.ml.tuning import CrossValidator
        evaluator = BinaryClassificationEvaluator(
            labelCol="label",
            rawPredictionCol="rawPrediction",
            metricName="areaUnderROC"
        )
        
        cv = CrossValidator(
            estimator=lr,
            estimatorParamMaps=param_grid,
            evaluator=evaluator,
            numFolds=3,
            seed=42
        )
        
        # 训练模型
        cv_model = cv.fit(train_data)
        
        # 获取最佳模型
        best_model = cv_model.bestModel
        print(f"最佳正则化参数: {best_model.getRegParam()}")
        print(f"最佳弹性网络参数: {best_model.getElasticNetParam()}")
        
        # 评估最佳模型
        predictions = cv_model.transform(test_data)
        auc = evaluator.evaluate(predictions)
        print(f"最佳模型AUC: {auc:.4f}")
        
        # 2. 训练验证分割
        print("\n2. 训练验证分割调优")
        
        from pyspark.ml.tuning import TrainValidationSplit
        
        tvs = TrainValidationSplit(
            estimator=lr,
            estimatorParamMaps=param_grid,
            evaluator=evaluator,
            trainRatio=0.8,
            seed=42
        )
        
        tvs_model = tvs.fit(train_data)
        tvs_best_model = tvs_model.bestModel
        
        print(f"TVS最佳正则化参数: {tvs_best_model.getRegParam()}")
        print(f"TVS最佳弹性网络参数: {tvs_best_model.getElasticNetParam()}")
        
        tvs_predictions = tvs_model.transform(test_data)
        tvs_auc = evaluator.evaluate(tvs_predictions)
        print(f"TVS最佳模型AUC: {tvs_auc:.4f}")
        
        return {
            'cv_model': cv_model,
            'tvs_model': tvs_model,
            'cv_auc': auc,
            'tvs_auc': tvs_auc,
            'best_params': {
                'regParam': best_model.getRegParam(),
                'elasticNetParam': best_model.getElasticNetParam()
            }
        }
    
    def visualize_mllib_architecture(self, results: Dict[str, Any]):
        """
        可视化MLlib架构和组件
        """
        print("\n=== MLlib架构可视化 ===")
        
        # 设置中文字体,避免图中的中文标签显示异常
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('Spark MLlib架构和组件', fontsize=16, fontweight='bold')
        
        # 1. MLlib组件架构
        ax = axes[0, 0]
        components = ['DataFrame', 'Transformer', 'Estimator', 'Pipeline', 'Evaluator', 'Tuning']
        y_pos = np.arange(len(components))
        colors = ['lightblue', 'lightgreen', 'lightyellow', 'lightcoral', 'lightpink', 'lightgray']
        
        bars = ax.barh(y_pos, [1]*len(components), color=colors, alpha=0.7)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(components)
        ax.set_title('MLlib核心组件')
        ax.set_xlabel('重要性')
        
        # 2. 特征工程流程
        ax = axes[0, 1]
        if 'indexed_data' in results:
            steps = ['原始数据', '索引化', '编码', '标准化', '特征选择']
            values = [100, 85, 75, 65, 50]
            
            ax.plot(steps, values, 'o-', linewidth=2, markersize=8, color='blue')
            ax.set_title('特征工程流程')
            ax.set_ylabel('数据维度')
            ax.tick_params(axis='x', rotation=45)
        
        # 3. 模型性能比较
        ax = axes[0, 2]
        if 'evaluation_results' in results:
            models = list(results['evaluation_results'].keys())
            metrics = ['auc', 'accuracy', 'precision', 'recall', 'f1']
            
            x = np.arange(len(models))
            width = 0.15
            
            for i, metric in enumerate(metrics):
                values = [results['evaluation_results'][model][metric] for model in models]
                ax.bar(x + i*width, values, width, label=metric, alpha=0.7)
            
            ax.set_title('模型性能比较')
            ax.set_xlabel('模型')
            ax.set_ylabel('分数')
            ax.set_xticks(x + width * 2)
            ax.set_xticklabels(models)
            ax.legend()
        
        # 4. 超参数调优结果
        ax = axes[1, 0]
        if 'cv_auc' in results and 'tvs_auc' in results:
            methods = ['CrossValidator', 'TrainValidationSplit']
            aucs = [results['cv_auc'], results['tvs_auc']]
            
            bars = ax.bar(methods, aucs, color=['skyblue', 'lightcoral'], alpha=0.7)
            ax.set_title('超参数调优方法比较')
            ax.set_ylabel('AUC')
            
            # 添加数值标签
            for bar, auc in zip(bars, aucs):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                       f'{auc:.3f}', ha='center', va='bottom')
        
        # 5. 机器学习管道
        ax = axes[1, 1]
        pipeline_stages = ['数据加载', 'Transformer1', 'Transformer2', 'Estimator', '模型']
        x_pos = np.arange(len(pipeline_stages))
        
        # 绘制管道流程
        for i in range(len(pipeline_stages)-1):
            ax.arrow(i, 0.5, 0.8, 0, head_width=0.1, head_length=0.1, 
                    fc='blue', ec='blue', alpha=0.7)
        
        for i, stage in enumerate(pipeline_stages):
            ax.text(i, 0.5, stage, ha='center', va='center', 
                   bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.7))
        
        ax.set_xlim(-0.5, len(pipeline_stages)-0.5)
        ax.set_ylim(0, 1)
        ax.set_title('机器学习管道')
        ax.axis('off')
        
        # 6. MLlib算法分类
        ax = axes[1, 2]
        algorithm_types = {
            '分类': ['LogisticRegression', 'DecisionTree', 'RandomForest', 'NaiveBayes'],
            '回归': ['LinearRegression', 'DecisionTreeRegressor', 'RandomForestRegressor'],
            '聚类': ['KMeans', 'GaussianMixture', 'BisectingKMeans'],
            '推荐': ['ALS']  # MLlib 中的协同过滤通过 ALS 实现
        }
        
        y_offset = 0
        colors = ['red', 'green', 'blue', 'orange']
        
        for i, (category, algorithms) in enumerate(algorithm_types.items()):
            ax.barh(y_offset, len(algorithms), color=colors[i], alpha=0.7, label=category)
            y_offset += 1
        
        ax.set_title('MLlib算法分类')
        ax.set_xlabel('算法数量')
        ax.set_yticks(range(len(algorithm_types)))
        ax.set_yticklabels(algorithm_types.keys())
        ax.legend()
        
        plt.tight_layout()
        plt.show()
        
        # 绘制MLlib架构图
        self._plot_mllib_architecture()
    
    def _plot_mllib_architecture(self):
        """
        绘制MLlib详细架构图
        """
        fig, ax = plt.subplots(1, 1, figsize=(16, 10))
        
        # 架构层次
        layers = {
            'API层': {
                'y': 0.8,
                'components': ['DataFrame API', 'Pipeline API', 'ML Algorithms']
            },
            '核心组件层': {
                'y': 0.6,
                'components': ['Transformer', 'Estimator', 'Evaluator', 'Tuning']
            },
            '算法层': {
                'y': 0.4,
                'components': ['Classification', 'Regression', 'Clustering', 'Recommendation']
            },
            '基础设施层': {
                'y': 0.2,
                'components': ['Spark Core', 'Spark SQL', 'Catalyst Optimizer']
            }
        }
        
        # 绘制层次和组件
        for layer_name, layer_info in layers.items():
            y = layer_info['y']
            components = layer_info['components']
            
            # 绘制层次标签
            ax.text(0.05, y, layer_name, fontsize=12, fontweight='bold',
                   bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.7))
            
            # 绘制组件
            x_step = 0.8 / len(components)
            for i, component in enumerate(components):
                x = 0.15 + i * x_step
                ax.text(x, y, component, ha='center', va='center', fontsize=10,
                       bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.7))
        
        # 绘制连接线
        for i in range(len(layers)-1):
            y1 = list(layers.values())[i]['y'] - 0.05
            y2 = list(layers.values())[i+1]['y'] + 0.05
            ax.plot([0.5, 0.5], [y1, y2], 'k--', alpha=0.5)
        
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.set_title('Spark MLlib架构图', fontsize=16, fontweight='bold')
        ax.axis('off')
        
        plt.tight_layout()
        plt.show()
    
    def cleanup(self):
        """
        清理资源
        """
        self.spark.stop()

# 使用示例
if __name__ == "__main__":
    # 创建MLlib架构演示实例
    mllib_arch = SparkMLlibArchitecture()
    
    try:
        # 演示MLlib核心组件
        component_results = mllib_arch.demonstrate_mllib_components()
        
        # 演示特征工程
        feature_results = mllib_arch.demonstrate_feature_engineering()
        
        # 演示模型评估
        evaluation_results = mllib_arch.demonstrate_model_evaluation()
        
        # 演示超参数调优
        tuning_results = mllib_arch.demonstrate_hyperparameter_tuning()
        
        # 合并结果
        all_results = {
            **component_results,
            **feature_results,
            **evaluation_results,
            **tuning_results
        }
        
        # 可视化架构
        mllib_arch.visualize_mllib_architecture(all_results)
        
    finally:
        # 清理资源
        mllib_arch.cleanup()

5.1.3 数据预处理基础

数据预处理是机器学习项目中最重要的步骤之一,直接影响模型的性能。常见的预处理工作包括缺失值处理、异常值处理、去重、数据类型转换和特征工程,下面依次进行演示。

数据清洗和转换

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.ml.stat import Correlation
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Any

class DataPreprocessing:
    """
    数据预处理和特征工程
    """
    
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("DataPreprocessing") \
            .config("spark.sql.adaptive.enabled", "true") \
            .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
            .getOrCreate()
        
        self.spark.sparkContext.setLogLevel("WARN")
    
    def create_sample_dataset(self):
        """
        创建包含各种数据质量问题的示例数据集
        """
        print("=== 创建示例数据集 ===")
        
        # 创建包含缺失值、异常值、重复值的数据
        data = [
            (1, "Alice", 25, "Engineer", 75000.0, "2020-01-15", "alice@email.com"),
            (2, "Bob", None, "Manager", 85000.0, "2019-03-20", "bob@email.com"),
            (3, "Charlie", 35, "Director", None, "2018-05-10", "charlie@email.com"),
            (4, "David", 28, "Engineer", 70000.0, "2021-02-28", None),
            (5, "Eve", 32, "Manager", 80000.0, "2020-07-12", "eve@email.com"),
            (6, "Frank", 150, "Director", 100000.0, "2017-11-05", "frank@email.com"),  # 异常年龄
            (7, "Grace", 26, "Engineer", 72000.0, "2021-09-18", "grace@email.com"),
            (8, "Henry", 38, "Manager", 88000.0, "2019-12-03", "henry@email.com"),
            (1, "Alice", 25, "Engineer", 75000.0, "2020-01-15", "alice@email.com"),  # 重复记录
            (9, "Ivy", -5, "Analyst", 60000.0, "2022-01-10", "ivy@email.com"),  # 异常年龄
            (10, "Jack", 30, "", 65000.0, "2021-06-15", "jack@email.com"),  # 空字符串
            (11, "Kate", 29, "Analyst", 999999.0, "2020-11-22", "kate@email.com")  # 异常薪资
        ]
        
        schema = StructType([
            StructField("id", IntegerType(), True),
            StructField("name", StringType(), True),
            StructField("age", IntegerType(), True),
            StructField("position", StringType(), True),
            StructField("salary", DoubleType(), True),
            StructField("hire_date", StringType(), True),
            StructField("email", StringType(), True)
        ])
        
        df = self.spark.createDataFrame(data, schema)
        
        print("原始数据集:")
        df.show()
        
        print("\n数据集基本信息:")
        print(f"总行数: {df.count()}")
        print(f"总列数: {len(df.columns)}")
        
        return df
    
    def analyze_data_quality(self, df):
        """
        分析数据质量问题
        """
        print("\n=== 数据质量分析 ===")
        
        # 1. 缺失值分析
        print("\n1. 缺失值分析:")
        missing_counts = []
        total_rows = df.count()
        # 注意:循环变量不要命名为 col,否则会覆盖 pyspark.sql.functions.col 函数
        for column_name in df.columns:
            missing_count = df.filter(
                col(column_name).isNull() |
                (col(column_name) == "") |
                (col(column_name) == "null")
            ).count()
            missing_percentage = (missing_count / total_rows) * 100
            missing_counts.append((column_name, missing_count, missing_percentage))
            print(f"  {column_name}: {missing_count} ({missing_percentage:.2f}%)")
        
        # 2. 重复值分析
        print("\n2. 重复值分析:")
        total_rows = df.count()
        distinct_rows = df.distinct().count()
        duplicate_rows = total_rows - distinct_rows
        print(f"  总行数: {total_rows}")
        print(f"  唯一行数: {distinct_rows}")
        print(f"  重复行数: {duplicate_rows}")
        
        # 3. 异常值分析
        print("\n3. 异常值分析:")
        
        # 年龄异常值
        age_stats = df.select("age").describe().collect()
        print("  年龄统计:")
        for row in age_stats:
            print(f"    {row['summary']}: {row['age']}")
        
        # 检查年龄异常值
        abnormal_age = df.filter((col("age") < 18) | (col("age") > 65))
        print(f"  异常年龄记录数: {abnormal_age.count()}")
        
        # 薪资异常值
        salary_stats = df.select("salary").describe().collect()
        print("\n  薪资统计:")
        for row in salary_stats:
            print(f"    {row['summary']}: {row['salary']}")
        
        # 检查薪资异常值(使用四分位数方法)
        salary_quantiles = df.select("salary").approxQuantile("salary", [0.25, 0.75], 0.05)
        if len(salary_quantiles) == 2:
            q1, q3 = salary_quantiles
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            
            abnormal_salary = df.filter(
                (col("salary") < lower_bound) | (col("salary") > upper_bound)
            )
            print(f"  异常薪资记录数: {abnormal_salary.count()}")
            print(f"  薪资正常范围: [{lower_bound:.2f}, {upper_bound:.2f}]")
        
        return {
            'missing_counts': missing_counts,
            'duplicate_rows': duplicate_rows,
            'abnormal_age': abnormal_age.collect(),
            'abnormal_salary': abnormal_salary.collect() if 'abnormal_salary' in locals() else []
        }
    
    def clean_missing_values(self, df):
        """
        处理缺失值
        """
        print("\n=== 缺失值处理 ===")
        
        # 1. 删除缺失值
        print("\n1. 删除策略")
        
        # 删除任何列有缺失值的行
        df_drop_any = df.dropna(how='any')
        print(f"  删除任何缺失值后行数: {df_drop_any.count()}")
        
        # 删除所有列都是缺失值的行
        df_drop_all = df.dropna(how='all')
        print(f"  删除全部缺失值后行数: {df_drop_all.count()}")
        
        # 删除特定列的缺失值
        df_drop_subset = df.dropna(subset=['name', 'position'])
        print(f"  删除name或position缺失值后行数: {df_drop_subset.count()}")
        
        # 2. 填充缺失值
        print("\n2. 填充策略")
        
        # 用常数填充
        df_fill_constant = df.fillna({
            'age': 30,  # 用平均年龄填充
            'position': 'Unknown',  # 用默认值填充
            'salary': 70000.0,  # 用中位数填充
            'email': 'unknown@company.com'  # 用默认邮箱填充
        })
        
        print("  常数填充后缺失值统计:")
        for column_name in df_fill_constant.columns:
            missing_count = df_fill_constant.filter(col(column_name).isNull()).count()
            print(f"    {column_name}: {missing_count}")
        
        # 3. 统计填充
        print("\n3. 统计填充")
        
        # 计算统计值
        age_mean = df.select(mean("age")).collect()[0][0]
        salary_median = df.approxQuantile("salary", [0.5], 0.05)[0]
        
        # 用统计值填充
        df_fill_stats = df.fillna({
            'age': int(age_mean) if age_mean else 30,
            'salary': salary_median if salary_median else 70000.0
        })
        
        print(f"  年龄平均值: {age_mean:.2f}" if age_mean else "  年龄平均值: None")
        print(f"  薪资中位数: {salary_median:.2f}" if salary_median else "  薪资中位数: None")
        
        # 4. 前向填充和后向填充(对于时间序列数据)
        print("\n4. 前向/后向填充")
        
        # 按ID排序后进行前向填充
        from pyspark.sql.window import Window
        
        window_spec = Window.orderBy("id").rowsBetween(Window.unboundedPreceding, 0)
        
        df_forward_fill = df.withColumn(
            "age_filled",
            last("age", ignorenulls=True).over(window_spec)
        )
        
        print("  前向填充示例:")
        df_forward_fill.select("id", "name", "age", "age_filled").show()
        
        return {
            'df_drop_any': df_drop_any,
            'df_fill_constant': df_fill_constant,
            'df_fill_stats': df_fill_stats,
            'df_forward_fill': df_forward_fill
        }
    
    def handle_outliers(self, df):
        """
        处理异常值
        """
        print("\n=== 异常值处理 ===")
        
        # 1. 基于统计的异常值检测
        print("\n1. 统计方法检测异常值")
        
        # Z-score方法
        age_mean = df.select(mean("age")).collect()[0][0]
        age_std = df.select(stddev("age")).collect()[0][0]
        
        if age_mean and age_std:
            df_with_zscore = df.withColumn(
                "age_zscore",
                abs((col("age") - age_mean) / age_std)
            )
            
            # 标记异常值(|z-score| > 2)
            df_outliers_zscore = df_with_zscore.filter(col("age_zscore") > 2)
            print(f"  Z-score方法检测到的年龄异常值: {df_outliers_zscore.count()}")
            
            df_outliers_zscore.select("name", "age", "age_zscore").show()
        
        # 2. 四分位数方法(IQR)
        print("\n2. IQR方法检测异常值")
        
        # 计算四分位数
        salary_quantiles = df.approxQuantile("salary", [0.25, 0.5, 0.75], 0.05)
        
        if len(salary_quantiles) == 3:
            q1, median, q3 = salary_quantiles
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            
            print(f"  Q1: {q1:.2f}, Q3: {q3:.2f}, IQR: {iqr:.2f}")
            print(f"  正常范围: [{lower_bound:.2f}, {upper_bound:.2f}]")
            
            # 标记异常值
            df_with_outlier_flag = df.withColumn(
                "salary_outlier",
                when((col("salary") < lower_bound) | (col("salary") > upper_bound), True)
                .otherwise(False)
            )
            
            df_outliers_iqr = df_with_outlier_flag.filter(col("salary_outlier") == True)
            print(f"  IQR方法检测到的薪资异常值: {df_outliers_iqr.count()}")
            
            df_outliers_iqr.select("name", "salary", "salary_outlier").show()
        
        # 3. 异常值处理策略
        print("\n3. 异常值处理策略")
        
        # 策略1:删除异常值
        df_remove_outliers = df.filter(
            (col("age") >= 18) & (col("age") <= 65) &
            (col("salary") >= 30000) & (col("salary") <= 200000)
        )
        print(f"  删除异常值后行数: {df_remove_outliers.count()}")
        
        # 策略2:截断异常值
        df_cap_outliers = df.withColumn(
            "age_capped",
            when(col("age") < 18, 18)
            .when(col("age") > 65, 65)
            .otherwise(col("age"))
        ).withColumn(
            "salary_capped",
            when(col("salary") < 30000, 30000)
            .when(col("salary") > 200000, 200000)
            .otherwise(col("salary"))
        )
        
        print("  截断处理前后对比:")
        df_cap_outliers.select(
            "name", "age", "age_capped", "salary", "salary_capped"
        ).show()
        
        # 策略3:变换处理
        df_transform = df.withColumn(
            "salary_log",
            log(col("salary"))
        ).withColumn(
            "salary_sqrt",
            sqrt(col("salary"))
        )
        
        print("  对数和平方根变换:")
        df_transform.select("name", "salary", "salary_log", "salary_sqrt").show()
        
        return {
            'df_remove_outliers': df_remove_outliers,
            'df_cap_outliers': df_cap_outliers,
            'df_transform': df_transform,
            'outliers_zscore': df_outliers_zscore.collect() if 'df_outliers_zscore' in locals() else [],
            'outliers_iqr': df_outliers_iqr.collect() if 'df_outliers_iqr' in locals() else []
        }
    
    def handle_duplicates(self, df):
        """
        处理重复数据
        """
        print("\n=== 重复数据处理 ===")
        
        # 1. 检测重复数据
        print("\n1. 重复数据检测")
        total_rows = df.count()
        distinct_rows = df.distinct().count()
        duplicate_rows = total_rows - distinct_rows
        
        print(f"  总行数: {total_rows}")
        print(f"  唯一行数: {distinct_rows}")
        print(f"  重复行数: {duplicate_rows}")
        
        # 显示重复的记录
        if duplicate_rows > 0:
            print("\n  重复记录:")
            # 找出重复的记录
            df_with_count = df.groupBy(df.columns).count()
            duplicates = df_with_count.filter(col("count") > 1)
            duplicates.show()
        
        # 2. 基于特定列检测重复
        print("\n2. 基于关键列检测重复")
        
        # 基于ID检测重复
        id_duplicates = df.groupBy("id").count().filter(col("count") > 1)
        print(f"  ID重复数: {id_duplicates.count()}")
        
        if id_duplicates.count() > 0:
            print("  重复的ID:")
            id_duplicates.show()
            
            # 显示具体的重复记录
            duplicate_ids = [row['id'] for row in id_duplicates.collect()]
            duplicate_records = df.filter(col("id").isin(duplicate_ids)).orderBy("id")
            duplicate_records.show()
        
        # 3. 去重策略
        print("\n3. 去重策略")
        
        # 策略1:完全去重
        df_distinct = df.distinct()
        print(f"  完全去重后行数: {df_distinct.count()}")
        
        # 策略2:基于特定列去重
        df_drop_duplicates = df.dropDuplicates(['id'])
        print(f"  基于ID去重后行数: {df_drop_duplicates.count()}")
        
        # 策略3:保留最新记录(基于日期)
        from pyspark.sql.window import Window
        
        # 转换日期格式
        df_with_date = df.withColumn(
            "hire_date_parsed",
            to_date(col("hire_date"), "yyyy-MM-dd")
        )
        
        # 为每个ID的记录添加行号(按日期降序)
        window_spec = Window.partitionBy("id").orderBy(desc("hire_date_parsed"))
        df_with_row_number = df_with_date.withColumn(
            "row_number",
            row_number().over(window_spec)
        )
        
        # 保留每个ID的最新记录
        df_latest = df_with_row_number.filter(col("row_number") == 1).drop("row_number", "hire_date_parsed")
        print(f"  保留最新记录后行数: {df_latest.count()}")
        
        return {
            'df_distinct': df_distinct,
            'df_drop_duplicates': df_drop_duplicates,
            'df_latest': df_latest,
            'duplicate_count': duplicate_rows
        }
    
    def data_type_conversion(self, df):
        """
        数据类型转换
        """
        print("\n=== 数据类型转换 ===")
        
        # 1. 查看当前数据类型
        print("\n1. 当前数据类型:")
        df.printSchema()
        
        # 2. 日期类型转换
        print("\n2. 日期类型转换")
        
        df_with_date = df.withColumn(
            "hire_date_parsed",
            to_date(col("hire_date"), "yyyy-MM-dd")
        ).withColumn(
            "hire_year",
            year(col("hire_date_parsed"))
        ).withColumn(
            "hire_month",
            month(col("hire_date_parsed"))
        ).withColumn(
            "days_since_hire",
            datediff(current_date(), col("hire_date_parsed"))
        )
        
        print("  日期转换结果:")
        df_with_date.select(
            "name", "hire_date", "hire_date_parsed", 
            "hire_year", "hire_month", "days_since_hire"
        ).show()
        
        # 3. 字符串处理
        print("\n3. 字符串处理")
        
        df_string_processed = df.withColumn(
            "name_upper",
            upper(col("name"))
        ).withColumn(
            "name_lower",
            lower(col("name"))
        ).withColumn(
            "name_length",
            length(col("name"))
        ).withColumn(
            "email_domain",
            regexp_extract(col("email"), "@(.+)", 1)
        ).withColumn(
            "position_clean",
            trim(col("position"))
        )
        
        print("  字符串处理结果:")
        df_string_processed.select(
            "name", "name_upper", "name_lower", "name_length",
            "email", "email_domain", "position", "position_clean"
        ).show()
        
        # 4. 数值类型转换
        print("\n4. 数值类型转换")
        
        df_numeric = df.withColumn(
            "salary_int",
            col("salary").cast(IntegerType())
        ).withColumn(
            "salary_string",
            col("salary").cast(StringType())
        ).withColumn(
            "age_double",
            col("age").cast(DoubleType())
        )
        
        print("  数值转换结果:")
        df_numeric.select(
            "name", "salary", "salary_int", "salary_string", "age", "age_double"
        ).show()
        
        # 5. 布尔类型转换
        print("\n5. 布尔类型转换")
        
        df_boolean = df.withColumn(
            "is_senior",
            col("age") > 35
        ).withColumn(
            "high_salary",
            col("salary") > 80000
        ).withColumn(
            "has_email",
            col("email").isNotNull()
        )
        
        print("  布尔转换结果:")
        df_boolean.select(
            "name", "age", "is_senior", "salary", "high_salary", "email", "has_email"
        ).show()
        
        return {
            'df_with_date': df_with_date,
            'df_string_processed': df_string_processed,
            'df_numeric': df_numeric,
            'df_boolean': df_boolean
        }
    
    def feature_engineering(self, df):
        """
        特征工程
        """
        print("\n=== 特征工程 ===")
        
        # 首先进行数据清洗
        df_clean = df.dropna().distinct()
        
        # 1. 数值特征工程
        print("\n1. 数值特征工程")
        
        df_numeric_features = df_clean.withColumn(
            "salary_log",
            log(col("salary"))
        ).withColumn(
            "age_squared",
            col("age") * col("age")
        ).withColumn(
            "salary_per_age",
            col("salary") / col("age")
        ).withColumn(
            "age_group",
            when(col("age") < 30, "Young")
            .when(col("age") < 40, "Middle")
            .otherwise("Senior")
        )
        
        print("  数值特征工程结果:")
        df_numeric_features.select(
            "name", "age", "salary", "salary_log", "age_squared", 
            "salary_per_age", "age_group"
        ).show()
        
        # 2. 分类特征工程
        print("\n2. 分类特征工程")
        
        # 字符串索引化
        position_indexer = StringIndexer(
            inputCol="position",
            outputCol="position_index"
        )
        
        age_group_indexer = StringIndexer(
            inputCol="age_group",
            outputCol="age_group_index"
        )
        
        # 独热编码
        position_encoder = OneHotEncoder(
            inputCol="position_index",
            outputCol="position_vec"
        )
        
        age_group_encoder = OneHotEncoder(
            inputCol="age_group_index",
            outputCol="age_group_vec"
        )
        
        # 应用转换
        df_indexed = position_indexer.fit(df_numeric_features).transform(df_numeric_features)
        df_indexed = age_group_indexer.fit(df_indexed).transform(df_indexed)
        df_encoded = position_encoder.fit(df_indexed).transform(df_indexed)
        df_encoded = age_group_encoder.fit(df_encoded).transform(df_encoded)
        
        print("  分类特征工程结果:")
        df_encoded.select(
            "name", "position", "position_index", "position_vec",
            "age_group", "age_group_index", "age_group_vec"
        ).show(truncate=False)
        
        # 3. 特征组合
        print("\n3. 特征组合")
        
        # 组装数值特征
        numeric_assembler = VectorAssembler(
            inputCols=["age", "salary", "salary_log", "age_squared", "salary_per_age"],
            outputCol="numeric_features"
        )
        
        df_numeric_assembled = numeric_assembler.transform(df_encoded)
        
        # 组装所有特征
        feature_assembler = VectorAssembler(
            inputCols=["numeric_features", "position_vec", "age_group_vec"],
            outputCol="features"
        )
        
        df_features = feature_assembler.transform(df_numeric_assembled)
        
        print("  特征组合结果:")
        df_features.select("name", "features").show(truncate=False)
        
        # 4. 特征标准化
        print("\n4. 特征标准化")
        
        # 标准化
        scaler = StandardScaler(
            inputCol="features",
            outputCol="scaled_features",
            withStd=True,
            withMean=True
        )
        
        scaler_model = scaler.fit(df_features)
        df_scaled = scaler_model.transform(df_features)
        
        print("  特征标准化结果:")
        df_scaled.select("name", "features", "scaled_features").show(truncate=False)
        
        # 5. 特征选择
        print("\n5. 特征选择")
        
        # 创建目标变量(高薪标签)
        df_with_label = df_scaled.withColumn(
            "high_salary_label",
            (col("salary") > 80000).cast("double")
        )
        
        # 卡方检验特征选择(注:卡方检验假设特征为类别型取值,这里对连续特征仅作演示)
        selector = ChiSqSelector(
            featuresCol="scaled_features",
            outputCol="selected_features",
            labelCol="high_salary_label",
            numTopFeatures=3
        )
        
        selector_model = selector.fit(df_with_label)
        df_selected = selector_model.transform(df_with_label)
        
        print("  特征选择结果:")
        df_selected.select("name", "scaled_features", "selected_features").show(truncate=False)
        
        return {
            'df_numeric_features': df_numeric_features,
            'df_encoded': df_encoded,
            'df_features': df_features,
            'df_scaled': df_scaled,
            'df_selected': df_selected
        }
    
    def visualize_preprocessing_results(self, results):
        """
        可视化数据预处理结果
        """
        print("\n=== 数据预处理结果可视化 ===")
        
        import matplotlib.pyplot as plt
        import seaborn as sns
        import pandas as pd
        
        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('数据预处理结果可视化', fontsize=16, fontweight='bold')
        
        # 1. 缺失值分析
        if 'missing_analysis' in results:
            missing_data = results['missing_analysis']
            missing_df = pd.DataFrame([
                {'列名': col, '缺失数量': count, '缺失百分比': percent}
                for col, count, percent in missing_data
            ])
            
            axes[0, 0].bar(missing_df['列名'], missing_df['缺失百分比'])
            axes[0, 0].set_title('缺失值分析')
            axes[0, 0].set_ylabel('缺失百分比 (%)')
            axes[0, 0].tick_params(axis='x', rotation=45)
        
        # 2. 异常值检测结果
        if 'outliers_zscore' in results and results['outliers_zscore']:
            outlier_ages = [row['age'] for row in results['outliers_zscore']]
            outlier_salaries = [row['salary'] for row in results['outliers_zscore']]
            
            axes[0, 1].scatter(outlier_ages, outlier_salaries, color='red', alpha=0.7)
            axes[0, 1].set_title('异常值检测 (Z-Score)')
            axes[0, 1].set_xlabel('年龄')
            axes[0, 1].set_ylabel('薪资')
        
        # 3. 数据类型分布
        type_counts = {'数值型': 3, '字符串型': 4, '日期型': 1, '布尔型': 3}  # 示例统计,实际应根据DataFrame的schema统计
        axes[0, 2].pie(type_counts.values(), labels=type_counts.keys(), autopct='%1.1f%%')
        axes[0, 2].set_title('数据类型分布')
        
        # 4. 特征工程前后对比
        original_features = ['age', 'salary', 'position']
        engineered_features = ['age', 'salary', 'salary_log', 'age_squared', 
                             'salary_per_age', 'position_vec', 'age_group_vec']
        
        feature_comparison = pd.DataFrame({
            '阶段': ['原始特征', '工程特征'],
            '特征数量': [len(original_features), len(engineered_features)]
        })
        
        axes[1, 0].bar(feature_comparison['阶段'], feature_comparison['特征数量'])
        axes[1, 0].set_title('特征工程前后对比')
        axes[1, 0].set_ylabel('特征数量')
        
        # 5. 数据处理流程
        process_steps = ['原始数据', '缺失值处理', '异常值处理', '类型转换', '特征工程', '标准化']
        process_counts = [1000, 950, 920, 920, 920, 920]  # 示例数据量
        
        axes[1, 1].plot(process_steps, process_counts, marker='o', linewidth=2, markersize=8)
        axes[1, 1].set_title('数据处理流程')
        axes[1, 1].set_ylabel('数据量')
        axes[1, 1].tick_params(axis='x', rotation=45)
        
        # 6. 处理效果总结
        effect_data = {
            '处理类型': ['缺失值', '重复值', '异常值', '特征数量'],
            '处理前': [50, 20, 15, 3],
            '处理后': [0, 0, 5, 7]
        }
        
        x = range(len(effect_data['处理类型']))
        width = 0.35
        
        axes[1, 2].bar([i - width/2 for i in x], effect_data['处理前'], 
                      width, label='处理前', alpha=0.8)
        axes[1, 2].bar([i + width/2 for i in x], effect_data['处理后'], 
                      width, label='处理后', alpha=0.8)
        
        axes[1, 2].set_title('处理效果对比')
        axes[1, 2].set_ylabel('数量')
        axes[1, 2].set_xticks(x)
        axes[1, 2].set_xticklabels(effect_data['处理类型'])
        axes[1, 2].legend()
        
        plt.tight_layout()
        plt.show()
        
        # 打印处理总结
        print("\n=== 数据预处理总结 ===")
        print("1. 数据质量分析:")
        print("   - 检测并处理了缺失值")
        print("   - 识别并处理了重复数据")
        print("   - 检测并处理了异常值")
        
        print("\n2. 数据类型转换:")
        print("   - 日期类型转换和特征提取")
        print("   - 字符串处理和清洗")
        print("   - 数值类型优化")
        print("   - 布尔类型转换")
        
        print("\n3. 特征工程:")
        print("   - 数值特征变换(对数、平方、比率)")
        print("   - 分类特征编码(索引化、独热编码)")
        print("   - 特征组合和向量化")
        print("   - 特征标准化")
        print("   - 特征选择")
        
        print("\n4. 处理效果:")
        print("   - 数据质量显著提升")
        print("   - 特征表达能力增强")
        print("   - 为机器学习模型做好准备")

# 演示数据预处理
if __name__ == "__main__":
    # 创建数据预处理器
    preprocessor = DataPreprocessor()
    
    # 创建示例数据
    sample_data = [
        (1, "Alice", 25, 75000, "Engineer", "alice@email.com", "2020-01-15"),
        (2, "Bob", 30, 85000, "Manager", "bob@email.com", "2019-03-20"),
        (3, "Charlie", 35, 95000, "Engineer", None, "2018-06-10"),
        (4, "David", None, 70000, "Analyst", "david@email.com", "2021-02-28"),
        (5, "Eve", 28, None, "Designer", "eve@email.com", "2020-08-05"),
        (6, "Frank", 45, 120000, "Director", "frank@email.com", "2017-11-12"),
        (7, "Grace", 32, 88000, "Manager", "grace@email.com", "2019-09-18"),
        (8, "Henry", 29, 78000, "Engineer", "henry@email.com", "2020-04-22"),
        (9, "Ivy", 150, 200000, "Engineer", "ivy@email.com", "2020-01-15"),  # 异常值
        (10, "Jack", 26, 72000, "Analyst", "jack@email.com", "2021-07-30"),
        (2, "Bob", 30, 85000, "Manager", "bob@email.com", "2019-03-20"),  # 重复数据
    ]
    
    columns = ["id", "name", "age", "salary", "position", "email", "hire_date"]
    df = preprocessor.spark.createDataFrame(sample_data, columns)
    
    print("=== 数据预处理演示 ===")
    print("\n原始数据:")
    df.show()
    
    # 执行数据预处理
    results = {}
    
    # 1. 数据质量分析
    quality_results = preprocessor.analyze_data_quality(df)
    results.update(quality_results)
    
    # 2. 缺失值处理
    missing_results = preprocessor.handle_missing_values(df)
    results.update(missing_results)
    
    # 3. 异常值处理
    outlier_results = preprocessor.handle_outliers(df)
    results.update(outlier_results)
    
    # 4. 重复数据处理
    duplicate_results = preprocessor.handle_duplicates(df)
    results.update(duplicate_results)
    
    # 5. 数据类型转换
    type_results = preprocessor.data_type_conversion(df)
    results.update(type_results)
    
    # 6. 特征工程
    feature_results = preprocessor.feature_engineering(df)
    results.update(feature_results)
    
    # 7. 可视化结果
    preprocessor.visualize_preprocessing_results(results)
    
    # 停止Spark会话
    preprocessor.spark.stop()
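
上面的预处理步骤是逐步手工执行的;在实际项目中,通常会把这些转换封装成一个Pipeline,先在训练集上fit,再统一应用到新数据。下面给出一个最小的封装示意(列名沿用本节示例,并假设缺失值已处理完毕):

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler

# 将常用的预处理阶段串成一个可复用的Pipeline
preprocess_pipeline = Pipeline(stages=[
    StringIndexer(inputCol="position", outputCol="position_index", handleInvalid="keep"),
    OneHotEncoder(inputCol="position_index", outputCol="position_vec"),
    VectorAssembler(inputCols=["age", "salary", "position_vec"], outputCol="features_raw"),
    StandardScaler(inputCol="features_raw", outputCol="features", withStd=True, withMean=True)
])

# pipeline_model = preprocess_pipeline.fit(df_train)   # 仅在训练集上拟合(df_train为假设的训练集)
# df_ready = pipeline_model.transform(df_new)          # 对新数据复用同一套转换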

5.3 分类算法

分类是监督学习的重要分支,用于预测离散的类别标签。Spark MLlib提供了多种分类算法。
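
Spark MLlib中的分类器都遵循统一的Estimator/Transformer接口:先fit得到模型,再transform产出预测列。下面的导入速览列出了几种常用分类器(仅作示意,具体用法见后续小节):

from pyspark.ml.classification import (
    LogisticRegression,          # 逻辑回归
    DecisionTreeClassifier,      # 决策树
    RandomForestClassifier,      # 随机森林
    GBTClassifier,               # 梯度提升树(仅支持二分类)
    NaiveBayes,                  # 朴素贝叶斯
    LinearSVC                    # 线性支持向量机
)

# 典型用法: model = estimator.fit(train_df); predictions = model.transform(test_df)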

5.3.1 逻辑回归

逻辑回归是最常用的分类算法之一,适用于二分类和多分类问题。
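
下面用一个极简的NumPy示意(与Spark API无关,权重和截距均为假设值)说明逻辑回归的核心思想:把特征的线性组合通过sigmoid函数映射为0~1之间的概率:

import numpy as np

def sigmoid(z):
    # sigmoid函数: 1 / (1 + e^(-z))
    return 1.0 / (1.0 + np.exp(-z))

w = np.array([0.8, -1.2])            # 假设的特征权重
b = 0.5                              # 假设的截距
x = np.array([1.0, 2.0])             # 一条样本的两个特征
prob = sigmoid(np.dot(w, x) + b)     # 预测为正类的概率
print(f"预测为正类的概率: {prob:.4f}")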

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import numpy as np

class LogisticRegressionDemo:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("LogisticRegressionDemo") \
            .config("spark.sql.adaptive.enabled", "true") \
            .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
            .getOrCreate()
        
        self.spark.sparkContext.setLogLevel("WARN")
    
    def create_sample_data(self):
        """
        创建示例数据集
        """
        print("=== 创建示例数据集 ===")
        
        # 创建二分类数据集(信用卡欺诈检测)
        binary_data = [
            (1, 25, 50000, 3, 0.2, 0),  # id, 年龄, 收入, 信用历史月数, 债务收入比, 标签(1=欺诈)
            (2, 35, 75000, 5, 0.1, 0),
            (3, 45, 90000, 8, 0.15, 0),
            (4, 28, 45000, 2, 0.8, 1),  # 欺诈
            (5, 55, 120000, 10, 0.05, 0),
            (6, 22, 30000, 1, 0.9, 1),  # 欺诈
            (7, 38, 80000, 6, 0.12, 0),
            (8, 42, 95000, 7, 0.18, 0),
            (9, 26, 35000, 2, 0.85, 1),  # 欺诈
            (10, 48, 110000, 9, 0.08, 0),
            (11, 31, 60000, 4, 0.25, 0),
            (12, 29, 40000, 2, 0.75, 1),  # 欺诈
            (13, 52, 130000, 12, 0.06, 0),
            (14, 24, 28000, 1, 0.95, 1),  # 欺诈
            (15, 39, 85000, 6, 0.14, 0)
        ]
        
        binary_columns = ["id", "age", "income", "credit_history_months", "debt_to_income", "is_fraud"]
        self.binary_df = self.spark.createDataFrame(binary_data, binary_columns)
        
        print("\n二分类数据集(信用卡欺诈检测):")
        self.binary_df.show()
        
        # 创建多分类数据集(客户分类)
        multi_data = [
            (1, 25, 50000, "Bachelor", "Engineer", "Bronze"),
            (2, 35, 75000, "Master", "Manager", "Silver"),
            (3, 45, 90000, "PhD", "Director", "Gold"),
            (4, 28, 45000, "Bachelor", "Analyst", "Bronze"),
            (5, 55, 120000, "Master", "Executive", "Gold"),
            (6, 22, 30000, "High School", "Assistant", "Bronze"),
            (7, 38, 80000, "Master", "Manager", "Silver"),
            (8, 42, 95000, "Bachelor", "Senior Engineer", "Silver"),
            (9, 26, 35000, "Bachelor", "Junior", "Bronze"),
            (10, 48, 110000, "PhD", "Director", "Gold"),
            (11, 31, 60000, "Master", "Engineer", "Silver"),
            (12, 29, 40000, "Bachelor", "Analyst", "Bronze"),
            (13, 52, 130000, "PhD", "VP", "Gold"),
            (14, 24, 28000, "High School", "Intern", "Bronze"),
            (15, 39, 85000, "Master", "Manager", "Silver")
        ]
        
        multi_columns = ["id", "age", "income", "education", "position", "customer_tier"]
        self.multi_df = self.spark.createDataFrame(multi_data, multi_columns)
        
        print("\n多分类数据集(客户分类):")
        self.multi_df.show()
        
        return self.binary_df, self.multi_df
    
    def binary_classification(self, df):
        """
        二分类逻辑回归
        """
        print("\n=== 二分类逻辑回归 ===")
        
        # 1. 特征工程
        print("\n1. 特征工程")
        
        # 组装特征向量
        feature_cols = ["age", "income", "credit_history_months", "debt_to_income"]
        assembler = VectorAssembler(
            inputCols=feature_cols,
            outputCol="features"
        )
        
        df_features = assembler.transform(df)
        
        print("  特征向量:")
        df_features.select("features", "is_fraud").show(truncate=False)
        
        # 2. 数据分割
        print("\n2. 数据分割")
        train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)
        
        print(f"  训练集大小: {train_data.count()}")
        print(f"  测试集大小: {test_data.count()}")
        
        # 3. 模型训练
        print("\n3. 模型训练")
        
        # 创建逻辑回归模型
        lr = LogisticRegression(
            featuresCol="features",
            labelCol="is_fraud",
            maxIter=100,
            regParam=0.01,
            elasticNetParam=0.0
        )
        
        # 训练模型
        lr_model = lr.fit(train_data)
        
        print("  模型训练完成")
        print(f"  系数: {lr_model.coefficients}")
        print(f"  截距: {lr_model.intercept}")
        
        # 4. 模型预测
        print("\n4. 模型预测")
        
        # 在测试集上预测
        predictions = lr_model.transform(test_data)
        
        print("  预测结果:")
        predictions.select(
            "age", "income", "debt_to_income", "is_fraud", 
            "prediction", "probability"
        ).show(truncate=False)
        
        # 5. 模型评估
        print("\n5. 模型评估")
        
        # 二分类评估器
        binary_evaluator = BinaryClassificationEvaluator(
            labelCol="is_fraud",
            rawPredictionCol="rawPrediction",
            metricName="areaUnderROC"
        )
        
        # 多分类评估器
        multi_evaluator = MulticlassClassificationEvaluator(
            labelCol="is_fraud",
            predictionCol="prediction"
        )
        
        # 计算评估指标
        auc = binary_evaluator.evaluate(predictions)
        accuracy = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "accuracy"})
        precision = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "weightedPrecision"})
        recall = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "weightedRecall"})
        f1 = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "f1"})
        
        print(f"  AUC: {auc:.4f}")
        print(f"  准确率: {accuracy:.4f}")
        print(f"  精确率: {precision:.4f}")
        print(f"  召回率: {recall:.4f}")
        print(f"  F1分数: {f1:.4f}")
        
        # 6. 混淆矩阵
        print("\n6. 混淆矩阵")
        
        confusion_matrix = predictions.groupBy("is_fraud", "prediction").count().orderBy("is_fraud", "prediction")
        print("  混淆矩阵:")
        confusion_matrix.show()
        
        return {
            'model': lr_model,
            'predictions': predictions,
            'metrics': {
                'auc': auc,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1
            }
        }
    
    def multiclass_classification(self, df):
        """
        多分类逻辑回归
        """
        print("\n=== 多分类逻辑回归 ===")
        
        # 1. 特征工程
        print("\n1. 特征工程")
        
        # 字符串索引化
        education_indexer = StringIndexer(
            inputCol="education",
            outputCol="education_index"
        )
        
        position_indexer = StringIndexer(
            inputCol="position",
            outputCol="position_index"
        )
        
        label_indexer = StringIndexer(
            inputCol="customer_tier",
            outputCol="label"
        )
        
        # 组装特征向量
        assembler = VectorAssembler(
            inputCols=["age", "income", "education_index", "position_index"],
            outputCol="features"
        )
        
        # 创建Pipeline
        pipeline = Pipeline(stages=[
            education_indexer,
            position_indexer,
            label_indexer,
            assembler
        ])
        
        # 应用Pipeline
        df_processed = pipeline.fit(df).transform(df)
        
        print("  处理后的数据:")
        df_processed.select(
            "age", "income", "education", "education_index",
            "position", "position_index", "customer_tier", "label", "features"
        ).show(truncate=False)
        
        # 2. 数据分割
        print("\n2. 数据分割")
        train_data, test_data = df_processed.randomSplit([0.8, 0.2], seed=42)
        
        print(f"  训练集大小: {train_data.count()}")
        print(f"  测试集大小: {test_data.count()}")
        
        # 3. 模型训练
        print("\n3. 模型训练")
        
        # 创建多分类逻辑回归模型
        mlr = LogisticRegression(
            featuresCol="features",
            labelCol="label",
            maxIter=100,
            regParam=0.01,
            elasticNetParam=0.0,
            family="multinomial"
        )
        
        # 训练模型
        mlr_model = mlr.fit(train_data)
        
        print("  模型训练完成")
        print(f"  系数矩阵形状: {mlr_model.coefficientMatrix.numRows} x {mlr_model.coefficientMatrix.numCols}")
        print(f"  截距向量: {mlr_model.interceptVector}")
        
        # 4. 模型预测
        print("\n4. 模型预测")
        
        # 在测试集上预测
        predictions = mlr_model.transform(test_data)
        
        print("  预测结果:")
        predictions.select(
            "age", "income", "education", "position", 
            "customer_tier", "label", "prediction", "probability"
        ).show(truncate=False)
        
        # 5. 模型评估
        print("\n5. 模型评估")
        
        # 多分类评估器
        evaluator = MulticlassClassificationEvaluator(
            labelCol="label",
            predictionCol="prediction"
        )
        
        # 计算评估指标
        accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
        precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
        recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
        f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
        
        print(f"  准确率: {accuracy:.4f}")
        print(f"  加权精确率: {precision:.4f}")
        print(f"  加权召回率: {recall:.4f}")
        print(f"  加权F1分数: {f1:.4f}")
        
        # 6. 混淆矩阵
        print("\n6. 混淆矩阵")
        
        confusion_matrix = predictions.groupBy("label", "prediction").count().orderBy("label", "prediction")
        print("  混淆矩阵:")
        confusion_matrix.show()
        
        # 7. 各类别详细指标
        print("\n7. 各类别详细指标")
        
        # 计算每个类别的精确率、召回率和F1分数
        labels = [0.0, 1.0, 2.0]  # StringIndexer按出现频次降序编号: Bronze(最多)=0, Silver=1, Gold=2(对应本示例数据)
        label_names = ["Bronze", "Silver", "Gold"]
        
        for i, (label, name) in enumerate(zip(labels, label_names)):
            tp = predictions.filter((col("label") == label) & (col("prediction") == label)).count()
            fp = predictions.filter((col("label") != label) & (col("prediction") == label)).count()
            fn = predictions.filter((col("label") == label) & (col("prediction") != label)).count()
            
            precision_class = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall_class = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1_class = 2 * precision_class * recall_class / (precision_class + recall_class) if (precision_class + recall_class) > 0 else 0
            
            print(f"  {name} 类别:")
            print(f"    精确率: {precision_class:.4f}")
            print(f"    召回率: {recall_class:.4f}")
            print(f"    F1分数: {f1_class:.4f}")
        
        return {
            'model': mlr_model,
            'predictions': predictions,
            'pipeline': pipeline,
            'metrics': {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1
            }
        }
    
    def hyperparameter_tuning(self, df):
        """
        超参数调优
        """
        print("\n=== 超参数调优 ===")
        
        from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
        
        # 1. 准备数据
        feature_cols = ["age", "income", "credit_history_months", "debt_to_income"]
        assembler = VectorAssembler(
            inputCols=feature_cols,
            outputCol="features"
        )
        
        df_features = assembler.transform(df)
        train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)
        
        # 2. 创建逻辑回归模型
        lr = LogisticRegression(
            featuresCol="features",
            labelCol="is_fraud"
        )
        
        # 3. 参数网格
        param_grid = ParamGridBuilder() \
            .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
            .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
            .addGrid(lr.maxIter, [50, 100]) \
            .build()
        
        print(f"  参数组合数量: {len(param_grid)}")
        
        # 4. 交叉验证
        evaluator = BinaryClassificationEvaluator(
            labelCol="is_fraud",
            rawPredictionCol="rawPrediction",
            metricName="areaUnderROC"
        )
        
        cv = CrossValidator(
            estimator=lr,
            estimatorParamMaps=param_grid,
            evaluator=evaluator,
            numFolds=3,
            seed=42
        )
        
        # 5. 训练和选择最佳模型
        print("\n  开始交叉验证...")
        cv_model = cv.fit(train_data)
        
        # 6. 最佳参数
        best_model = cv_model.bestModel
        print("\n  最佳参数:")
        print(f"    regParam: {best_model.getRegParam()}")
        print(f"    elasticNetParam: {best_model.getElasticNetParam()}")
        print(f"    maxIter: {best_model.getMaxIter()}")
        
        # 7. 在测试集上评估
        predictions = cv_model.transform(test_data)
        auc = evaluator.evaluate(predictions)
        print(f"\n  测试集AUC: {auc:.4f}")
        
        # 8. 交叉验证结果
        print("\n  交叉验证平均分数:")
        avg_metrics = cv_model.avgMetrics
        for i, score in enumerate(avg_metrics):
            print(f"    参数组合 {i+1}: {score:.4f}")
        
        return {
            'best_model': best_model,
            'cv_model': cv_model,
            'test_auc': auc,
            'avg_metrics': avg_metrics
        }
    
    def visualize_results(self, binary_results, multi_results, tuning_results):
        """
        可视化逻辑回归结果
        """
        print("\n=== 逻辑回归结果可视化 ===")
        
        import matplotlib.pyplot as plt
        import seaborn as sns
        import pandas as pd
        import numpy as np
        
        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('逻辑回归分析结果', fontsize=16, fontweight='bold')
        
        # 1. 二分类性能指标
        binary_metrics = binary_results['metrics']
        metrics_names = list(binary_metrics.keys())
        metrics_values = list(binary_metrics.values())
        
        axes[0, 0].bar(metrics_names, metrics_values, color=['skyblue', 'lightgreen', 'lightcoral', 'gold', 'plum'])
        axes[0, 0].set_title('二分类性能指标')
        axes[0, 0].set_ylabel('分数')
        axes[0, 0].set_ylim(0, 1)
        axes[0, 0].tick_params(axis='x', rotation=45)
        
        # 添加数值标签
        for i, v in enumerate(metrics_values):
            axes[0, 0].text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')
        
        # 2. 多分类性能指标
        multi_metrics = multi_results['metrics']
        multi_names = list(multi_metrics.keys())
        multi_values = list(multi_metrics.values())
        
        axes[0, 1].bar(multi_names, multi_values, color=['lightblue', 'lightgreen', 'lightcoral', 'gold'])
        axes[0, 1].set_title('多分类性能指标')
        axes[0, 1].set_ylabel('分数')
        axes[0, 1].set_ylim(0, 1)
        axes[0, 1].tick_params(axis='x', rotation=45)
        
        # 添加数值标签
        for i, v in enumerate(multi_values):
            axes[0, 1].text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')
        
        # 3. 超参数调优结果
        tuning_metrics = tuning_results['avg_metrics']
        param_combinations = [f'组合{i+1}' for i in range(len(tuning_metrics))]
        
        axes[0, 2].plot(param_combinations, tuning_metrics, marker='o', linewidth=2, markersize=8)
        axes[0, 2].set_title('超参数调优结果')
        axes[0, 2].set_ylabel('交叉验证AUC')
        axes[0, 2].tick_params(axis='x', rotation=45)
        axes[0, 2].grid(True, alpha=0.3)
        
        # 标记最佳点
        best_idx = np.argmax(tuning_metrics)
        axes[0, 2].scatter(best_idx, tuning_metrics[best_idx], color='red', s=100, zorder=5)
        axes[0, 2].annotate(f'最佳: {tuning_metrics[best_idx]:.3f}', 
                           xy=(best_idx, tuning_metrics[best_idx]),
                           xytext=(10, 10), textcoords='offset points',
                           bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7),
                           arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
        
        # 4. 算法比较
        algorithms = ['二分类逻辑回归', '多分类逻辑回归']
        accuracies = [binary_metrics['accuracy'], multi_metrics['accuracy']]
        f1_scores = [binary_metrics['f1'], multi_metrics['f1']]
        
        x = np.arange(len(algorithms))
        width = 0.35
        
        axes[1, 0].bar(x - width/2, accuracies, width, label='准确率', alpha=0.8)
        axes[1, 0].bar(x + width/2, f1_scores, width, label='F1分数', alpha=0.8)
        
        axes[1, 0].set_title('算法性能比较')
        axes[1, 0].set_ylabel('分数')
        axes[1, 0].set_xticks(x)
        axes[1, 0].set_xticklabels(algorithms)
        axes[1, 0].legend()
        axes[1, 0].set_ylim(0, 1)
        
        # 5. 特征重要性(二分类)
        feature_names = ['年龄', '收入', '信用历史', '债务收入比']
        coefficients = binary_results['model'].coefficients.toArray()
        
        # 取绝对值表示重要性
        importance = np.abs(coefficients)
        
        axes[1, 1].barh(feature_names, importance)
        axes[1, 1].set_title('特征重要性(二分类)')
        axes[1, 1].set_xlabel('系数绝对值')
        
        # 6. 模型复杂度分析
        reg_params = [0.01, 0.1, 1.0]
        model_complexity = [1/r for r in reg_params]  # 正则化参数越小,模型越复杂
        performance = [0.85, 0.82, 0.78]  # 示例性能数据
        
        axes[1, 2].plot(model_complexity, performance, marker='o', linewidth=2, markersize=8)
        axes[1, 2].set_title('模型复杂度 vs 性能')
        axes[1, 2].set_xlabel('模型复杂度 (1/regParam)')
        axes[1, 2].set_ylabel('性能 (AUC)')
        axes[1, 2].grid(True, alpha=0.3)
        
        # 标注正则化参数
        for i, (complexity, perf, reg) in enumerate(zip(model_complexity, performance, reg_params)):
            axes[1, 2].annotate(f'λ={reg}', xy=(complexity, perf), 
                               xytext=(5, 5), textcoords='offset points')
        
        plt.tight_layout()
        plt.show()
        
        # 打印总结
        print("\n=== 逻辑回归分析总结 ===")
        print("1. 二分类结果:")
        print(f"   - AUC: {binary_metrics['auc']:.4f}")
        print(f"   - 准确率: {binary_metrics['accuracy']:.4f}")
        print(f"   - F1分数: {binary_metrics['f1']:.4f}")
        
        print("\n2. 多分类结果:")
        print(f"   - 准确率: {multi_metrics['accuracy']:.4f}")
        print(f"   - 加权F1分数: {multi_metrics['f1']:.4f}")
        
        print("\n3. 超参数调优:")
        best_model = tuning_results['best_model']
        print(f"   - 最佳正则化参数: {best_model.getRegParam()}")
        print(f"   - 最佳弹性网络参数: {best_model.getElasticNetParam()}")
        print(f"   - 测试集AUC: {tuning_results['test_auc']:.4f}")
        
        print("\n4. 模型特点:")
        print("   - 逻辑回归适用于线性可分问题")
        print("   - 支持二分类和多分类")
        print("   - 具有良好的可解释性")
        print("   - 训练速度快,内存占用少")
        print("   - 对特征缩放敏感")

# 演示逻辑回归
if __name__ == "__main__":
    # 创建逻辑回归演示器
    lr_demo = LogisticRegressionDemo()
    
    # 创建示例数据
    binary_df, multi_df = lr_demo.create_sample_data()
    
    print("=== 逻辑回归演示 ===")
    
    # 1. 二分类演示
    binary_results = lr_demo.binary_classification(binary_df)
    
    # 2. 多分类演示
    multi_results = lr_demo.multiclass_classification(multi_df)
    
    # 3. 超参数调优演示
    tuning_results = lr_demo.hyperparameter_tuning(binary_df)
    
    # 4. 结果可视化
    lr_demo.visualize_results(binary_results, multi_results, tuning_results)
    
    # 停止Spark会话
    lr_demo.spark.stop()
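
前面的总结提到逻辑回归对特征缩放敏感。一个常见做法是在Pipeline中把StandardScaler放在逻辑回归之前(以下为示意,假设训练/测试集已包含向量化后的features_raw特征列):

from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression

# 先标准化特征,再训练逻辑回归,避免量纲较大的特征(如收入)主导模型
scaler = StandardScaler(inputCol="features_raw", outputCol="features", withStd=True, withMean=True)
lr = LogisticRegression(featuresCol="features", labelCol="is_fraud", maxIter=100, regParam=0.01)

pipeline = Pipeline(stages=[scaler, lr])
# pipeline_model = pipeline.fit(train_data)          # train_data为假设的训练集
# predictions = pipeline_model.transform(test_data)  # test_data为假设的测试集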

5.3.2 决策树

决策树是一种直观的分类算法,通过一系列基于特征阈值的if/else判定规则进行决策,模型结构易于解释。下面先给出一个最小的训练示意。
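
该示意中的数据与列名均为假设值,仅展示DecisionTreeClassifier的基本用法:

from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.appName("DTQuickStart").getOrCreate()

# 两个数值特征 + 一个二分类标签(示例数据)
df = spark.createDataFrame([
    (3.0, 80.0, 1.0),
    (1.0, 60.0, 0.0),
    (5.0, 90.0, 1.0),
    (2.0, 55.0, 0.0)
], ["experience", "score", "label"])

assembler = VectorAssembler(inputCols=["experience", "score"], outputCol="features")
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label", maxDepth=3)

model = dt.fit(assembler.transform(df))
print(model.toDebugString)   # 打印树的判定规则,便于理解"一系列规则"的含义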

from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.sql.functions import *
import matplotlib.pyplot as plt

class DecisionTreeDemo:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("DecisionTreeDemo") \
            .config("spark.sql.adaptive.enabled", "true") \
            .getOrCreate()
        
        self.spark.sparkContext.setLogLevel("WARN")
    
    def create_sample_data(self):
        """
        创建示例数据集(员工绩效评估)
        """
        print("=== 创建员工绩效评估数据集 ===")
        
        data = [
            (1, 25, "Bachelor", 2, 85, 7.5, "Good"),
            (2, 30, "Master", 5, 92, 8.2, "Excellent"),
            (3, 35, "PhD", 8, 78, 6.8, "Average"),
            (4, 28, "Bachelor", 3, 88, 7.8, "Good"),
            (5, 45, "Master", 12, 95, 9.1, "Excellent"),
            (6, 22, "Bachelor", 1, 72, 6.2, "Average"),
            (7, 38, "PhD", 10, 89, 8.5, "Good"),
            (8, 42, "Master", 15, 96, 9.3, "Excellent"),
            (9, 26, "Bachelor", 2, 75, 6.5, "Average"),
            (10, 33, "Master", 7, 91, 8.0, "Good"),
            (11, 29, "Bachelor", 4, 86, 7.6, "Good"),
            (12, 50, "PhD", 20, 93, 8.8, "Excellent"),
            (13, 24, "Bachelor", 1, 70, 6.0, "Average"),
            (14, 36, "Master", 9, 87, 7.9, "Good"),
            (15, 41, "PhD", 13, 94, 9.0, "Excellent"),
            (16, 27, "Bachelor", 3, 79, 6.9, "Average"),
            (17, 39, "Master", 11, 90, 8.3, "Good"),
            (18, 31, "Bachelor", 5, 84, 7.4, "Good"),
            (19, 46, "PhD", 16, 97, 9.5, "Excellent"),
            (20, 23, "Bachelor", 1, 68, 5.8, "Average")
        ]
        
        columns = ["id", "age", "education", "experience", "performance_score", "rating", "performance_level"]
        self.df = self.spark.createDataFrame(data, columns)
        
        print("\n员工绩效数据:")
        self.df.show()
        
        # 数据统计
        print("\n绩效等级分布:")
        self.df.groupBy("performance_level").count().orderBy("performance_level").show()
        
        return self.df
    
    def train_kmeans(self, df, k=5):
        """
        训练K-Means聚类模型
        """
        print(f"\n=== K-Means聚类模型训练 (K={k}) ===")
        
        # 1. 特征工程
        print("\n1. 特征工程")
        
        feature_cols = [
            "age", "income", "spending_score", "purchase_frequency",
            "loyalty_score", "online_ratio", "customer_lifetime",
            "avg_order_value", "return_rate", "recommendation_acceptance"
        ]
        
        # 特征向量化
        assembler = VectorAssembler(
            inputCols=feature_cols,
            outputCol="features_raw"
        )
        
        df_features = assembler.transform(df)
        
        # 特征标准化(K-Means对特征尺度敏感)
        scaler = StandardScaler(
            inputCol="features_raw",
            outputCol="features",
            withStd=True,
            withMean=True
        )
        
        scaler_model = scaler.fit(df_features)
        df_scaled = scaler_model.transform(df_features)
        
        print("  特征工程完成")
        print(f"  特征维度: {len(feature_cols)}")
        print("  已进行标准化处理")
        
        # 2. 创建K-Means模型
        print("\n2. 创建K-Means模型")
        
        kmeans = KMeans(
            featuresCol="features",
            predictionCol="cluster",
            k=k,
            seed=42,
            maxIter=100,
            tol=1e-4
        )
        
        # 3. 训练模型
        print("\n3. 训练模型")
        kmeans_model = kmeans.fit(df_scaled)
        
        print("  模型训练完成")
        print(f"  聚类数量: {k}")
        print(f"  迭代次数: {kmeans_model.summary.numIter}")
        
        # 4. 聚类预测
        print("\n4. 聚类预测")
        predictions = kmeans_model.transform(df_scaled)
        
        print("  聚类结果样例:")
        predictions.select(
            "customer_id", "age", "income", "spending_score",
            "loyalty_score", "cluster"
        ).show(10)
        
        # 5. 聚类中心
        print("\n5. 聚类中心")
        centers = kmeans_model.clusterCenters()
        
        print("  各聚类中心特征值:")
        for i, center in enumerate(centers):
            print(f"  聚类 {i}:")
            for j, feature in enumerate(feature_cols):
                print(f"    {feature}: {center[j]:.4f}")
            print()
        
        # 6. 聚类分布
        print("\n6. 聚类分布")
        cluster_counts = predictions.groupBy("cluster").count().orderBy("cluster")
        cluster_counts.show()
        
        total_count = predictions.count()
        print("  聚类比例:")
        for row in cluster_counts.collect():
            cluster_id = row['cluster']
            count = row['count']
            percentage = (count / total_count) * 100
            print(f"    聚类 {cluster_id}: {count} 个客户 ({percentage:.2f}%)")
        
        # 7. 聚类评估
        print("\n7. 聚类评估")
        
        # 计算WSSSE (Within Set Sum of Squared Errors)
        wssse = kmeans_model.summary.trainingCost
        print(f"  WSSSE (簇内平方和): {wssse:.4f}")
        
        # 计算轮廓系数
        evaluator = ClusteringEvaluator(
            featuresCol="features",
            predictionCol="cluster",
            metricName="silhouette"
        )
        
        silhouette_score = evaluator.evaluate(predictions)
        print(f"  轮廓系数: {silhouette_score:.4f}")
        
        if silhouette_score > 0.5:
            print("  ✅ 聚类质量良好")
        elif silhouette_score > 0.25:
            print("  ⚠️ 聚类质量一般")
        else:
            print("  ❌ 聚类质量较差,建议调整K值")
        
        return {
            'model': kmeans_model,
            'predictions': predictions,
            'feature_cols': feature_cols,
            'scaler_model': scaler_model,
            'metrics': {
                'wssse': wssse,
                'silhouette_score': silhouette_score,
                'num_iter': kmeans_model.summary.numIter
            },
            'cluster_centers': centers,
            'cluster_distribution': cluster_counts.collect()
        }
    
    def find_optimal_k(self, df, k_range=(2, 11)):
        """
        寻找最优的K值
        """
        print(f"\n=== 寻找最优K值 (范围: {k_range[0]}-{k_range[1]-1}) ===")
        
        # 特征工程
        feature_cols = [
            "age", "income", "spending_score", "purchase_frequency",
            "loyalty_score", "online_ratio", "customer_lifetime",
            "avg_order_value", "return_rate", "recommendation_acceptance"
        ]
        
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw")
        df_features = assembler.transform(df)
        
        scaler = StandardScaler(
            inputCol="features_raw",
            outputCol="features",
            withStd=True,
            withMean=True
        )
        
        scaler_model = scaler.fit(df_features)
        df_scaled = scaler_model.transform(df_features)
        
        # 测试不同的K值
        results = []
        evaluator = ClusteringEvaluator(
            featuresCol="features",
            predictionCol="cluster",
            metricName="silhouette"
        )
        
        for k in range(k_range[0], k_range[1]):
            print(f"\n  测试 K={k}")
            
            # 创建和训练模型
            kmeans = KMeans(
                featuresCol="features",
                predictionCol="cluster",
                k=k,
                seed=42,
                maxIter=100
            )
            
            model = kmeans.fit(df_scaled)
            predictions = model.transform(df_scaled)
            
            # 评估指标
            wssse = model.summary.trainingCost
            silhouette_score = evaluator.evaluate(predictions)
            
            result = {
                'k': k,
                'wssse': wssse,
                'silhouette_score': silhouette_score,
                'num_iter': model.summary.numIter
            }
            
            results.append(result)
            
            print(f"    WSSSE: {wssse:.4f}")
            print(f"    轮廓系数: {silhouette_score:.4f}")
            print(f"    迭代次数: {model.summary.numIter}")
        
        # 找到最优K值
        best_k_silhouette = max(results, key=lambda x: x['silhouette_score'])
        
        print(f"\n  最优K值分析:")
        print(f"    基于轮廓系数的最优K: {best_k_silhouette['k']}")
        print(f"    对应轮廓系数: {best_k_silhouette['silhouette_score']:.4f}")
        
        return results
    
    def analyze_clusters(self, model_results):
        """
        分析聚类结果,提供业务洞察
        """
        print("\n=== 聚类结果业务分析 ===")
        
        predictions = model_results['predictions']
        feature_cols = model_results['feature_cols']
        
        # 计算每个聚类的特征统计
        cluster_stats = []
        
        for cluster_id in range(len(model_results['cluster_centers'])):
            cluster_data = predictions.filter(col("cluster") == cluster_id)
            
            stats = {}
            stats['cluster_id'] = cluster_id
            stats['count'] = cluster_data.count()
            
            # 计算各特征的平均值
            for feature in feature_cols:
                avg_value = cluster_data.agg(avg(col(feature)).alias(f"avg_{feature}")).collect()[0][f"avg_{feature}"]
                stats[f'avg_{feature}'] = avg_value
            
            cluster_stats.append(stats)
        
        # 输出聚类特征分析
        print("\n  各聚类特征分析:")
        for stats in cluster_stats:
            cluster_id = stats['cluster_id']
            count = stats['count']
            
            print(f"\n  聚类 {cluster_id} ({count} 个客户):")
            print(f"    平均年龄: {stats['avg_age']:.1f} 岁")
            print(f"    平均收入: {stats['avg_income']:.1f} 万元")
            print(f"    平均消费评分: {stats['avg_spending_score']:.1f}")
            print(f"    平均购买频次: {stats['avg_purchase_frequency']:.1f} 次/年")
            print(f"    平均忠诚度: {stats['avg_loyalty_score']:.1f}")
            print(f"    平均在线购买比例: {stats['avg_online_ratio']:.2f}")
            print(f"    平均客户生命周期: {stats['avg_customer_lifetime']:.1f} 月")
            print(f"    平均订单价值: {stats['avg_avg_order_value']:.0f} 元")
            print(f"    平均退货率: {stats['avg_return_rate']:.3f}")
            print(f"    平均推荐接受率: {stats['avg_recommendation_acceptance']:.3f}")
        
        # 客户群体标签
        print("\n  客户群体标签建议:")
        for stats in cluster_stats:
            cluster_id = stats['cluster_id']
            
            # 基于特征值给出标签建议
            if stats['avg_income'] > 100 and stats['avg_spending_score'] > 70:
                label = "高价值客户"
                strategy = "提供VIP服务,推荐高端产品"
            elif stats['avg_loyalty_score'] > 8 and stats['avg_purchase_frequency'] > 20:
                label = "忠诚客户"
                strategy = "维护关系,推荐新品,提供专属优惠"
            elif stats['avg_age'] < 30 and stats['avg_online_ratio'] > 0.7:
                label = "年轻数字化客户"
                strategy = "移动端营销,社交媒体推广"
            elif stats['avg_return_rate'] > 0.1:
                label = "高风险客户"
                strategy = "改善产品质量,加强客服支持"
            elif stats['avg_spending_score'] < 30:
                label = "价格敏感客户"
                strategy = "提供折扣优惠,推荐性价比产品"
            else:
                label = "普通客户"
                strategy = "标准化服务,定期营销活动"
            
            print(f"    聚类 {cluster_id}: {label}")
            print(f"      营销策略: {strategy}")
        
        return cluster_stats
    
    def visualize_kmeans_results(self, model_results, k_optimization_results=None):
        """
        可视化K-Means聚类结果
        """
        import matplotlib.pyplot as plt
        import seaborn as sns
        import numpy as np
        import pandas as pd
        from sklearn.decomposition import PCA
        
        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('K-Means聚类分析结果', fontsize=16, fontweight='bold')
        
        predictions = model_results['predictions']
        feature_cols = model_results['feature_cols']
        
        # 转换为Pandas DataFrame进行可视化
        df_pandas = predictions.select(
            *feature_cols, "cluster"
        ).toPandas()
        
        # 1. PCA降维可视化
        features_for_pca = df_pandas[feature_cols].values
        pca = PCA(n_components=2)
        features_2d = pca.fit_transform(features_for_pca)
        
        scatter = axes[0, 0].scatter(
            features_2d[:, 0], features_2d[:, 1], 
            c=df_pandas['cluster'], cmap='viridis', alpha=0.6
        )
        axes[0, 0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} 方差)')
        axes[0, 0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} 方差)')
        axes[0, 0].set_title('PCA降维聚类可视化')
        plt.colorbar(scatter, ax=axes[0, 0])
        
        # 2. 聚类分布
        cluster_counts = df_pandas['cluster'].value_counts().sort_index()
        bars = axes[0, 1].bar(cluster_counts.index, cluster_counts.values, alpha=0.7)
        axes[0, 1].set_xlabel('聚类ID')
        axes[0, 1].set_ylabel('客户数量')
        axes[0, 1].set_title('聚类分布')
        axes[0, 1].grid(True, alpha=0.3)
        
        # 添加数值标签
        for bar, count in zip(bars, cluster_counts.values):
            height = bar.get_height()
            axes[0, 1].text(bar.get_x() + bar.get_width()/2., height,
                           f'{count}', ha='center', va='bottom')
        
        # 3. 特征重要性(基于聚类中心的方差)
        centers = np.array(model_results['cluster_centers'])
        feature_variance = np.var(centers, axis=0)
        
        sorted_idx = np.argsort(feature_variance)[::-1]
        sorted_features = [feature_cols[i] for i in sorted_idx]
        sorted_variance = feature_variance[sorted_idx]
        
        bars = axes[0, 2].bar(range(len(sorted_features)), sorted_variance, alpha=0.7)
        axes[0, 2].set_xticks(range(len(sorted_features)))
        axes[0, 2].set_xticklabels(sorted_features, rotation=45)
        axes[0, 2].set_ylabel('聚类中心方差')
        axes[0, 2].set_title('特征区分度')
        axes[0, 2].grid(True, alpha=0.3)
        
        # 4. K值优化曲线(如果有优化结果)
        if k_optimization_results:
            k_values = [r['k'] for r in k_optimization_results]
            wssse_values = [r['wssse'] for r in k_optimization_results]
            silhouette_values = [r['silhouette_score'] for r in k_optimization_results]
            
            ax1 = axes[1, 0]
            ax2 = ax1.twinx()
            
            line1 = ax1.plot(k_values, wssse_values, 'bo-', label='WSSSE', linewidth=2)
            line2 = ax2.plot(k_values, silhouette_values, 'ro-', label='轮廓系数', linewidth=2)
            
            ax1.set_xlabel('K值')
            ax1.set_ylabel('WSSSE', color='blue')
            ax2.set_ylabel('轮廓系数', color='red')
            ax1.set_title('K值优化曲线')
            ax1.grid(True, alpha=0.3)
            
            # 合并图例
            lines = line1 + line2
            labels = [l.get_label() for l in lines]
            ax1.legend(lines, labels, loc='center right')
        
        # 5. 收入vs消费评分散点图
        scatter = axes[1, 1].scatter(
            df_pandas['income'], df_pandas['spending_score'],
            c=df_pandas['cluster'], cmap='viridis', alpha=0.6
        )
        axes[1, 1].set_xlabel('收入 (万元)')
        axes[1, 1].set_ylabel('消费评分')
        axes[1, 1].set_title('收入 vs 消费评分')
        plt.colorbar(scatter, ax=axes[1, 1])
        
        # 6. 聚类中心热力图
        centers_df = pd.DataFrame(centers, columns=feature_cols)
        centers_df.index = [f'聚类{i}' for i in range(len(centers))]
        
        sns.heatmap(
            centers_df.T, annot=True, fmt='.2f', cmap='RdYlBu_r',
            ax=axes[1, 2], cbar_kws={'label': '标准化特征值'}
        )
        axes[1, 2].set_title('聚类中心热力图')
        axes[1, 2].set_xlabel('聚类')
        axes[1, 2].set_ylabel('特征')
        
        plt.tight_layout()
        plt.show()
        
        # 输出详细分析
        print("\n=== K-Means聚类模型分析报告 ===")
        metrics = model_results['metrics']
        print(f"模型性能指标:")
        print(f"  - WSSSE: {metrics['wssse']:.4f}")
        print(f"  - 轮廓系数: {metrics['silhouette_score']:.4f}")
        print(f"  - 迭代次数: {metrics['num_iter']}")
        
        print(f"\n聚类质量评估:")
        if metrics['silhouette_score'] > 0.5:
            quality = "优秀"
        elif metrics['silhouette_score'] > 0.25:
            quality = "良好"
        else:
            quality = "需要改进"
        print(f"  - 聚类质量: {quality}")
        
        print(f"\n特征区分度分析:")
        top_features = sorted(zip(feature_cols, feature_variance), 
                            key=lambda x: x[1], reverse=True)[:5]
        for feature, variance in top_features:
            print(f"  - {feature}: {variance:.4f}")
        
        if k_optimization_results:
            best_k = max(k_optimization_results, key=lambda x: x['silhouette_score'])
            print(f"\n最优K值建议:")
            print(f"  - 推荐K值: {best_k['k']}")
            print(f"  - 对应轮廓系数: {best_k['silhouette_score']:.4f}")
        
        return {
            'pca_variance_ratio': pca.explained_variance_ratio_,
            'feature_importance': dict(zip(feature_cols, feature_variance))
        }

# 演示代码
if __name__ == "__main__":
    # 创建演示实例
    demo = KMeansDemo()
    
    # 1. 创建数据集
    print("=== 创建客户细分数据集 ===")
    df = demo.create_dataset(n_samples=3000)
    
    # 2. 寻找最优K值
    k_optimization_results = demo.find_optimal_k(df, k_range=(2, 11))
    
    # 3. 训练K-Means模型(使用最优K值)
    best_k = max(k_optimization_results, key=lambda x: x['silhouette_score'])['k']
    model_results = demo.train_kmeans(df, k=best_k)
    
    # 4. 聚类分析
    cluster_stats = demo.analyze_clusters(model_results)
    
    # 5. 可视化结果
    demo.visualize_kmeans_results(model_results, k_optimization_results)
    
    print("\n=== K-Means聚类演示完成 ===")

总结:

K-Means聚类具有以下特点:

  1. 优点

    • 算法简单,计算效率高
    • 适合球形聚类
    • 结果可解释性强
    • 内存占用少
  2. 缺点

    • 需要预先指定K值
    • 对初始中心敏感(可通过k-means||初始化缓解,见下文示意)
    • 假设聚类为球形
    • 对异常值敏感
  3. 适用场景

    • 客户细分
    • 市场分析
    • 图像分割
    • 数据压缩
  4. 业务建议

    • 客户细分中,收入和消费行为是关键特征
    • 建议结合业务知识确定K值
    • 定期重新训练模型适应客户行为变化
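
针对上面提到的"需要预先指定K值"和"对初始中心敏感"两点:Spark的KMeans默认使用k-means||(并行化的k-means++)初始化,可通过initMode与initSteps显式控制;K值本身仍需结合WSSSE肘部法则或轮廓系数选择。示意代码如下:

from pyspark.ml.clustering import KMeans

kmeans = KMeans(
    featuresCol="features",
    predictionCol="cluster",
    k=5,                      # K值需结合肘部法则/轮廓系数等方法确定
    initMode="k-means||",     # 并行化的k-means++初始化,降低对初始中心的敏感性
    initSteps=5,              # 初始化阶段的迭代步数
    seed=42,
    maxIter=100
)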

5.2 高斯混合模型 (GMM)

高斯混合模型是一种概率聚类算法,假设数据来自多个高斯分布的混合。
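
下面用一个一维NumPy示意(权重、均值和标准差均为假设值)说明"多个高斯分布的混合":整体密度是各分量密度按混合权重加权求和。

import numpy as np

def gaussian_pdf(x, mu, sigma):
    # 一维高斯分布的概率密度
    return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

weights = [0.6, 0.4]                  # 混合权重,和为1(假设值)
mus, sigmas = [0.0, 5.0], [1.0, 2.0]  # 两个分量的均值与标准差(假设值)

x = 2.0
density = sum(w * gaussian_pdf(x, m, s) for w, m, s in zip(weights, mus, sigmas))
print(f"x={x} 处的混合密度: {density:.4f}")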

from pyspark.sql import SparkSession
from pyspark.ml.clustering import GaussianMixture
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

class GMMDemo:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("GMMDemo") \
            .getOrCreate()
    
    def create_dataset(self, n_samples=2500):
        """
        创建用户行为分析数据集
        """
        print(f"\n创建用户行为分析数据集 (样本数: {n_samples})")
        
        np.random.seed(42)
        
        # 生成用户ID
        user_ids = [f"USER_{i:05d}" for i in range(1, n_samples + 1)]
        
        # 用户行为特征
        # 日活跃时长(小时)
        daily_active_hours = np.random.gamma(2, 2, n_samples)
        daily_active_hours = np.clip(daily_active_hours, 0.1, 12)
        
        # 页面浏览数(日均)
        page_views = np.random.lognormal(3, 1, n_samples)
        page_views = np.clip(page_views, 5, 500).astype(int)
        
        # 点击率
        click_rates = np.random.beta(2, 8, n_samples)  # 大多数用户点击率较低
        
        # 转化率
        conversion_rates = np.random.beta(1, 19, n_samples)  # 转化率通常很低
        
        # 会话深度(页面/会话)
        session_depth = np.random.gamma(3, 2, n_samples)
        session_depth = np.clip(session_depth, 1, 50)
        
        # 跳出率
        bounce_rates = np.random.beta(3, 2, n_samples)
        
        # 移动端使用比例
        mobile_usage = np.random.beta(4, 2, n_samples)
        
        # 社交分享次数(月)
        social_shares = np.random.poisson(3, n_samples)
        social_shares = np.clip(social_shares, 0, 50)
        
        # 评论/评价次数(月)
        review_count = np.random.poisson(2, n_samples)
        review_count = np.clip(review_count, 0, 30)
        
        # 平均评分(1-5)
        avg_rating = np.random.normal(4, 0.8, n_samples)
        avg_rating = np.clip(avg_rating, 1, 5)
        
        # 创建DataFrame
        data = [
            (
                user_ids[i], float(daily_active_hours[i]), int(page_views[i]),
                float(click_rates[i]), float(conversion_rates[i]), float(session_depth[i]),
                float(bounce_rates[i]), float(mobile_usage[i]), int(social_shares[i]),
                int(review_count[i]), float(avg_rating[i])
            )
            for i in range(n_samples)
        ]
        
        schema = StructType([
            StructField("user_id", StringType(), True),
            StructField("daily_active_hours", DoubleType(), True),
            StructField("page_views", IntegerType(), True),
            StructField("click_rate", DoubleType(), True),
            StructField("conversion_rate", DoubleType(), True),
            StructField("session_depth", DoubleType(), True),
            StructField("bounce_rate", DoubleType(), True),
            StructField("mobile_usage", DoubleType(), True),
            StructField("social_shares", IntegerType(), True),
            StructField("review_count", IntegerType(), True),
            StructField("avg_rating", DoubleType(), True)
        ])
        
        self.df = self.spark.createDataFrame(data, schema)
        
        print(f"  生成数据集大小: {self.df.count()} 行, {len(self.df.columns)} 列")
        
        # 显示数据概览
        print("\n  数据概览:")
        self.df.show(10)
        
        # 显示数据统计
        print("\n  数据统计:")
        self.df.describe().show()
        
        return self.df
    
    def train_gmm(self, df, k=4):
        """
        训练高斯混合模型
        """
        print(f"\n=== 高斯混合模型训练 (K={k}) ===")
        
        # 1. 特征工程
        print("\n1. 特征工程")
        
        feature_cols = [
            "daily_active_hours", "page_views", "click_rate", "conversion_rate",
            "session_depth", "bounce_rate", "mobile_usage", "social_shares",
            "review_count", "avg_rating"
        ]
        
        # 特征向量化
        assembler = VectorAssembler(
            inputCols=feature_cols,
            outputCol="features_raw"
        )
        
        df_features = assembler.transform(df)
        
        # 特征标准化
        scaler = StandardScaler(
            inputCol="features_raw",
            outputCol="features",
            withStd=True,
            withMean=True
        )
        
        scaler_model = scaler.fit(df_features)
        df_scaled = scaler_model.transform(df_features)
        
        print("  特征工程完成")
        print(f"  特征维度: {len(feature_cols)}")
        print("  已进行标准化处理")
        
        # 2. 创建高斯混合模型
        print("\n2. 创建高斯混合模型")
        
        gmm = GaussianMixture(
            featuresCol="features",
            predictionCol="cluster",
            probabilityCol="probability",
            k=k,
            seed=42,
            maxIter=100,
            tol=1e-6
        )
        
        # 3. 训练模型
        print("\n3. 训练模型")
        gmm_model = gmm.fit(df_scaled)
        
        print("  模型训练完成")
        print(f"  聚类数量: {k}")
        print(f"  对数似然: {gmm_model.summary.logLikelihood:.4f}")
        
        # 4. 聚类预测
        print("\n4. 聚类预测")
        predictions = gmm_model.transform(df_scaled)
        
        print("  聚类结果样例:")
        predictions.select(
            "user_id", "daily_active_hours", "page_views", "click_rate",
            "conversion_rate", "cluster", "probability"
        ).show(10)
        
        return {
            'model': gmm_model,
            'predictions': predictions,
            'feature_cols': feature_cols,
            'scaler_model': scaler_model
        }
    
    def find_optimal_k_gmm(self, df, k_range=(2, 8)):
        """
        寻找最优的K值(基于BIC)
        """
        print(f"\n=== 寻找最优K值 (范围: {k_range[0]}-{k_range[1]-1}) ===")
        
        # 特征工程
        feature_cols = [
            "daily_active_hours", "page_views", "click_rate", "conversion_rate",
            "session_depth", "bounce_rate", "mobile_usage", "social_shares",
            "review_count", "avg_rating"
        ]
        
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw")
        df_features = assembler.transform(df)
        
        scaler = StandardScaler(
            inputCol="features_raw",
            outputCol="features",
            withStd=True,
            withMean=True
        )
        
        scaler_model = scaler.fit(df_features)
        df_scaled = scaler_model.transform(df_features)
        
        n_samples = df_scaled.count()
        
        # 测试不同的K值
        results = []
        
        for k in range(k_range[0], k_range[1]):
            print(f"\n  测试 K={k}")
            
            # 创建和训练模型
            gmm = GaussianMixture(
                featuresCol="features",
                predictionCol="cluster",
                k=k,
                seed=42,
                maxIter=100
            )
            
            model = gmm.fit(df_scaled)
            predictions = model.transform(df_scaled)
            
            # 计算评估指标
            log_likelihood = model.summary.logLikelihood
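            # 参数个数 = k个均值向量(每个d维)
            #          + k个协方差矩阵(每个含 d(d+1)/2 个自由参数)
            #          + (k-1)个独立的混合权重,其中 d = len(feature_cols)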
            n_params = k * (len(feature_cols) + len(feature_cols) * (len(feature_cols) + 1) // 2) + k - 1
            
            aic = 2 * n_params - 2 * log_likelihood
            bic = np.log(n_samples) * n_params - 2 * log_likelihood
            
            # 轮廓系数
            from pyspark.ml.evaluation import ClusteringEvaluator
            evaluator = ClusteringEvaluator(
                featuresCol="features",
                predictionCol="cluster",
                metricName="silhouette"
            )
            silhouette_score = evaluator.evaluate(predictions)
            
            result = {
                'k': k,
                'log_likelihood': log_likelihood,
                'aic': aic,
                'bic': bic,
                'silhouette_score': silhouette_score
            }
            
            results.append(result)
            
            print(f"    对数似然: {log_likelihood:.4f}")
            print(f"    AIC: {aic:.4f}")
            print(f"    BIC: {bic:.4f}")
            print(f"    轮廓系数: {silhouette_score:.4f}")
        
        # 找到最优K值
        best_k_bic = min(results, key=lambda x: x['bic'])
        best_k_silhouette = max(results, key=lambda x: x['silhouette_score'])
        
        print(f"\n  最优K值分析:")
        print(f"    基于BIC的最优K: {best_k_bic['k']} (BIC: {best_k_bic['bic']:.4f})")
        print(f"    基于轮廓系数的最优K: {best_k_silhouette['k']} (轮廓系数: {best_k_silhouette['silhouette_score']:.4f})")
        
        return results
    
    def analyze_gmm_clusters(self, model_results):
        """
        分析GMM聚类结果,提供业务洞察
        """
        print("\n=== GMM聚类结果业务分析 ===")
        
        predictions = model_results['predictions']
        feature_cols = model_results['feature_cols']
        
        # 计算每个聚类的特征统计
        cluster_stats = []
        
        # 获取聚类数量
        cluster_count = predictions.select("cluster").distinct().count()
        
        for cluster_id in range(cluster_count):
            cluster_data = predictions.filter(col("cluster") == cluster_id)
            
            stats = {}
            stats['cluster_id'] = cluster_id
            stats['count'] = cluster_data.count()
            
            # 计算各特征的平均值
            for feature in feature_cols:
                avg_value = cluster_data.agg(avg(col(feature)).alias(f"avg_{feature}")).collect()[0][f"avg_{feature}"]
                stats[f'avg_{feature}'] = avg_value
            
            cluster_stats.append(stats)
        
        # 输出聚类特征分析
        print("\n  各聚类特征分析:")
        for stats in cluster_stats:
            cluster_id = stats['cluster_id']
            count = stats['count']
            
            print(f"\n  聚类 {cluster_id} ({count} 个用户):")
            print(f"    平均日活跃时长: {stats['avg_daily_active_hours']:.2f} 小时")
            print(f"    平均页面浏览数: {stats['avg_page_views']:.0f} 次")
            print(f"    平均点击率: {stats['avg_click_rate']:.3f}")
            print(f"    平均转化率: {stats['avg_conversion_rate']:.3f}")
            print(f"    平均会话深度: {stats['avg_session_depth']:.1f} 页面")
            print(f"    平均跳出率: {stats['avg_bounce_rate']:.3f}")
            print(f"    平均移动端使用比例: {stats['avg_mobile_usage']:.3f}")
            print(f"    平均社交分享次数: {stats['avg_social_shares']:.1f} 次/月")
            print(f"    平均评论次数: {stats['avg_review_count']:.1f} 次/月")
            print(f"    平均评分: {stats['avg_avg_rating']:.2f}")
        
        # 用户群体标签
        print("\n  用户群体标签建议:")
        for stats in cluster_stats:
            cluster_id = stats['cluster_id']
            
            # 基于特征值给出标签建议
            if stats['avg_daily_active_hours'] > 6 and stats['avg_conversion_rate'] > 0.1:
                label = "高价值活跃用户"
                strategy = "重点维护,提供个性化推荐和优质服务"
            elif stats['avg_click_rate'] > 0.2 and stats['avg_page_views'] > 100:
                label = "高参与度用户"
                strategy = "内容营销,增加互动功能"
            elif stats['avg_mobile_usage'] > 0.8 and stats['avg_social_shares'] > 5:
                label = "移动社交用户"
                strategy = "移动端优化,社交功能推广"
            elif stats['avg_bounce_rate'] > 0.7:
                label = "流失风险用户"
                strategy = "改善用户体验,提供引导和帮助"
            elif stats['avg_conversion_rate'] < 0.02:
                label = "潜在转化用户"
                strategy = "优化转化漏斗,提供激励措施"
            else:
                label = "普通用户"
                strategy = "标准化服务,定期活动推送"
            
            print(f"    聚类 {cluster_id}: {label}")
            print(f"      运营策略: {strategy}")
        
        return cluster_stats
    
    def visualize_gmm_results(self, model_results, k_optimization_results=None):
        """
        可视化GMM聚类结果
        """
        import matplotlib.pyplot as plt
        import seaborn as sns
        import numpy as np
        import pandas as pd
        from sklearn.decomposition import PCA
        
        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('高斯混合模型聚类分析结果', fontsize=16, fontweight='bold')
        
        predictions = model_results['predictions']
        feature_cols = model_results['feature_cols']
        
        # 转换为Pandas DataFrame进行可视化
        df_pandas = predictions.select(
            *feature_cols, "cluster", "probability"
        ).toPandas()
        
        # 1. PCA降维可视化
        features_for_pca = df_pandas[feature_cols].values
        pca = PCA(n_components=2)
        features_2d = pca.fit_transform(features_for_pca)
        
        scatter = axes[0, 0].scatter(
            features_2d[:, 0], features_2d[:, 1], 
            c=df_pandas['cluster'], cmap='viridis', alpha=0.6
        )
        axes[0, 0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} 方差)')
        axes[0, 0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} 方差)')
        axes[0, 0].set_title('PCA降维聚类可视化')
        plt.colorbar(scatter, ax=axes[0, 0])
        
        # 2. 聚类分布
        cluster_counts = df_pandas['cluster'].value_counts().sort_index()
        
        bars = axes[0, 1].bar(cluster_counts.index, cluster_counts.values, alpha=0.7)
        axes[0, 1].set_xlabel('聚类ID')
        axes[0, 1].set_ylabel('用户数量')
        axes[0, 1].set_title('聚类分布')
        axes[0, 1].grid(True, alpha=0.3)
        
        # 3. 特征重要性(基于方差)
        cluster_means = []
        for cluster_id in range(len(cluster_counts)):
            cluster_data = df_pandas[df_pandas['cluster'] == cluster_id]
            means = [cluster_data[col].mean() for col in feature_cols]
            cluster_means.append(means)
        
        cluster_means = np.array(cluster_means)
        feature_variance = np.var(cluster_means, axis=0)
        
        sorted_idx = np.argsort(feature_variance)[::-1]
        sorted_features = [feature_cols[i] for i in sorted_idx]
        sorted_variance = feature_variance[sorted_idx]
        
        bars = axes[0, 2].bar(range(len(sorted_features)), sorted_variance, alpha=0.7)
        axes[0, 2].set_xticks(range(len(sorted_features)))
        axes[0, 2].set_xticklabels(sorted_features, rotation=45)
        axes[0, 2].set_ylabel('聚类间方差')
        axes[0, 2].set_title('特征区分度')
        axes[0, 2].grid(True, alpha=0.3)
        
        # 4. K值优化曲线(如果有优化结果)
        if k_optimization_results:
            k_values = [r['k'] for r in k_optimization_results]
            bic_values = [r['bic'] for r in k_optimization_results]
            aic_values = [r['aic'] for r in k_optimization_results]
            
            axes[1, 0].plot(k_values, bic_values, 'bo-', label='BIC', linewidth=2)
            axes[1, 0].plot(k_values, aic_values, 'ro-', label='AIC', linewidth=2)
            axes[1, 0].set_xlabel('K值')
            axes[1, 0].set_ylabel('信息准则')
            axes[1, 0].set_title('模型选择曲线')
            axes[1, 0].legend()
            axes[1, 0].grid(True, alpha=0.3)
        
        # 5. 特征分布对比
        # 选择几个重要特征进行对比
        important_features = sorted_features[:4]
        
        for i, feature in enumerate(important_features):
            if i < 2:
                ax = axes[1, 1] if i == 0 else axes[1, 2]
                
                for cluster_id in cluster_counts.index:
                    cluster_data = df_pandas[df_pandas['cluster'] == cluster_id]
                    ax.hist(cluster_data[feature], alpha=0.5, 
                           label=f'聚类{cluster_id}', bins=20)
                
                ax.set_xlabel(feature)
                ax.set_ylabel('频次')
                ax.set_title(f'{feature} 分布对比')
                ax.legend()
                ax.grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()
        
        # 输出详细分析
        print("\n=== 高斯混合模型分析报告 ===")
        
        print(f"\n特征区分度分析:")
        top_features = sorted(zip(feature_cols, feature_variance), 
                            key=lambda x: x[1], reverse=True)[:5]
        for feature, variance in top_features:
            print(f"  - {feature}: {variance:.4f}")
        
        if k_optimization_results:
            best_k_bic = min(k_optimization_results, key=lambda x: x['bic'])
            print(f"\n最优K值建议:")
            print(f"  - 推荐K值 (BIC): {best_k_bic['k']}")
            print(f"  - 对应BIC: {best_k_bic['bic']:.4f}")
        
        return {
            'pca_variance_ratio': pca.explained_variance_ratio_,
            'feature_importance': dict(zip(feature_cols, feature_variance))
        }

# 演示代码
if __name__ == "__main__":
    # 创建演示实例
    demo = GMMDemo()
    
    # 1. 创建数据集
    print("=== 创建用户行为分析数据集 ===")
    df = demo.create_dataset(n_samples=2500)
    
    # 2. 寻找最优K值
    k_optimization_results = demo.find_optimal_k_gmm(df, k_range=(2, 8))
    
    # 3. 训练GMM模型(使用最优K值)
    best_k = min(k_optimization_results, key=lambda x: x['bic'])['k']
    model_results = demo.train_gmm(df, k=best_k)
    
    # 4. 聚类分析
    cluster_stats = demo.analyze_gmm_clusters(model_results)
    
    # 5. 可视化结果
    demo.visualize_gmm_results(model_results, k_optimization_results)
    
    print("\n=== 高斯混合模型演示完成 ===")

总结:

高斯混合模型具有以下特点:

  1. 优点

    • 提供概率归属,支持软聚类
    • 能够处理椭圆形聚类
    • 有理论基础的模型选择准则(AIC/BIC)
    • 能够处理不同大小和形状的聚类
  2. 缺点

    • 计算复杂度较高
    • 对初始化敏感
    • 需要假设数据服从高斯分布
    • 参数较多,容易过拟合
  3. 适用场景

    • 用户行为分析
    • 异常检测
    • 密度估计
    • 需要概率输出的场景
  4. 业务建议

    • 用户行为分析中,活跃度和转化率是关键指标
    • 建议使用BIC选择最优聚类数
    • 利用概率输出进行精准营销(示意代码见下)
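针对“利用概率输出进行精准营销”这一条,下面给出一个按聚类归属概率筛选高置信用户的示意片段。它假设沿用本节 GMM 演示中的 predictions(含 cluster 列与 probability 向量列),并依赖 Spark 3.0+ 提供的 vector_to_array;目标聚类编号与阈值均为假设值,仅作思路演示:

from pyspark.sql import functions as F
from pyspark.ml.functions import vector_to_array

def select_high_confidence_users(predictions, target_cluster, threshold=0.8):
    """筛选以高置信度归属于目标聚类的用户,用于精准营销触达(示意)"""
    return (predictions
            .withColumn("prob_arr", vector_to_array(F.col("probability")))   # 概率向量转为数组
            .withColumn("target_prob", F.col("prob_arr")[target_cluster])    # 目标聚类的归属概率
            .filter((F.col("cluster") == target_cluster) &
                    (F.col("target_prob") >= threshold)))

# 用法示意:假设聚类2对应"高价值活跃用户"
# high_conf_users = select_high_confidence_users(model_results['predictions'], target_cluster=2)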

6. 推荐系统

推荐系统是机器学习在商业应用中的重要领域,用于为用户推荐感兴趣的内容或商品。

6.1 协同过滤 (ALS)

交替最小二乘法(ALS)通过把用户-物品评分矩阵分解为两个低维隐因子矩阵来实现协同过滤,预测评分即用户隐向量与物品隐向量的点积,特别适合处理大规模稀疏数据。
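在进入完整的 Spark 实现之前,先用一个极简的 NumPy 片段示意矩阵分解的核心思想(隐向量数值纯属假设):

import numpy as np

# 假设 rank=3:每个用户、每个物品各对应一个3维隐向量
user_factors = {1: np.array([0.9, 0.2, 0.5])}
item_factors = {101: np.array([1.2, 0.1, 0.8])}

# 预测评分 = 用户隐向量与物品隐向量的点积
pred_rating = float(np.dot(user_factors[1], item_factors[101]))
print(f"用户1对物品101的预测评分: {pred_rating:.2f}")

# "交替"指:固定物品因子求解用户因子,再固定用户因子求解物品因子,
# 每一步都是带L2正则的最小二乘问题,如此迭代直至收敛

下面给出基于 Spark MLlib 的完整实现。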

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

class ALSDemo:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("ALSDemo") \
            .getOrCreate()
    
    def create_dataset(self, n_users=1000, n_items=500, n_ratings=50000):
        """
        创建电影评分数据集
        """
        print(f"\n创建电影评分数据集")
        print(f"  用户数: {n_users}, 电影数: {n_items}, 评分数: {n_ratings}")
        
        np.random.seed(42)
        
        # 生成用户ID和电影ID
        user_ids = np.random.randint(1, n_users + 1, n_ratings)
        item_ids = np.random.randint(1, n_items + 1, n_ratings)
        
        # 生成评分(1-5分,偏向高分)
        ratings = np.random.choice([1, 2, 3, 4, 5], n_ratings, 
                                 p=[0.05, 0.1, 0.2, 0.35, 0.3])
        
        # 添加一些用户偏好模式
        # 某些用户倾向于给高分
        high_rating_users = np.random.choice(range(1, n_users + 1), n_users // 10)
        for i, user_id in enumerate(user_ids):
            if user_id in high_rating_users and np.random.random() < 0.7:
                ratings[i] = min(5, ratings[i] + 1)
        
        # 某些电影更受欢迎
        popular_items = np.random.choice(range(1, n_items + 1), n_items // 20)
        for i, item_id in enumerate(item_ids):
            if item_id in popular_items and np.random.random() < 0.6:
                ratings[i] = min(5, ratings[i] + 1)
        
        # 创建DataFrame
        data = list(zip(user_ids.tolist(), item_ids.tolist(), ratings.tolist()))
        
        # 去重(同一用户对同一电影只保留一个评分)
        data_dict = {}
        for user_id, item_id, rating in data:
            key = (user_id, item_id)
            if key not in data_dict:
                data_dict[key] = rating
        
        final_data = [(user_id, item_id, rating) 
                     for (user_id, item_id), rating in data_dict.items()]
        
        schema = StructType([
            StructField("user_id", IntegerType(), True),
            StructField("item_id", IntegerType(), True),
            StructField("rating", IntegerType(), True)
        ])
        
        self.df = self.spark.createDataFrame(final_data, schema)
        
        print(f"  实际生成评分数: {self.df.count()}")
        
        # 显示数据概览
        print("\n  数据概览:")
        self.df.show(10)
        
        # 显示数据统计
        print("\n  数据统计:")
        self.df.describe().show()
        
        # 用户和物品统计
        unique_users = self.df.select("user_id").distinct().count()
        unique_items = self.df.select("item_id").distinct().count()
        
        print(f"\n  实际用户数: {unique_users}")
        print(f"  实际电影数: {unique_items}")
        print(f"  稀疏度: {1 - self.df.count() / (unique_users * unique_items):.4f}")
        
        return self.df
    
    def train_als(self, df, rank=10, max_iter=10, reg_param=0.1):
        """
        训练ALS协同过滤模型
        """
        print(f"\n=== ALS协同过滤模型训练 ===")
        print(f"  隐因子维度: {rank}")
        print(f"  最大迭代次数: {max_iter}")
        print(f"  正则化参数: {reg_param}")
        
        # 1. 数据分割
        print("\n1. 数据分割")
        train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)
        
        print(f"  训练集大小: {train_data.count()}")
        print(f"  测试集大小: {test_data.count()}")
        
        # 2. 创建ALS模型
        print("\n2. 创建ALS模型")
        
        als = ALS(
            userCol="user_id",
            itemCol="item_id",
            ratingCol="rating",
            rank=rank,
            maxIter=max_iter,
            regParam=reg_param,
            coldStartStrategy="drop",
            seed=42
        )
        
        # 3. 训练模型
        print("\n3. 训练模型")
        als_model = als.fit(train_data)
        
        print("  模型训练完成")
        print(f"  用户因子矩阵维度: {als_model.userFactors.count()} x {rank}")
        print(f"  物品因子矩阵维度: {als_model.itemFactors.count()} x {rank}")
        
        # 4. 模型预测
        print("\n4. 模型预测")
        predictions = als_model.transform(test_data)
        
        print("  预测结果样例:")
        predictions.select(
            "user_id", "item_id", "rating", "prediction"
        ).show(10)
        
        # 5. 模型评估
        print("\n5. 模型评估")
        
        evaluator = RegressionEvaluator(
            metricName="rmse",
            labelCol="rating",
            predictionCol="prediction"
        )
        
        rmse = evaluator.evaluate(predictions)
        
        evaluator_mae = RegressionEvaluator(
            metricName="mae",
            labelCol="rating",
            predictionCol="prediction"
        )
        
        mae = evaluator_mae.evaluate(predictions)
        
        print(f"  RMSE: {rmse:.4f}")
        print(f"  MAE: {mae:.4f}")
        
        # 6. 推荐质量分析
        print("\n6. 推荐质量分析")
        
        # 预测值分布
        pred_stats = predictions.select("prediction").describe()
        pred_stats.show()
        
        # 预测准确性分析
        accuracy_analysis = predictions.withColumn(
            "error", abs(col("rating") - col("prediction"))
        ).withColumn(
            "accurate", when(col("error") <= 0.5, 1).otherwise(0)
        )
        
        accuracy_rate = accuracy_analysis.agg(
            avg("accurate").alias("accuracy_rate")
        ).collect()[0]["accuracy_rate"]
        
        print(f"  预测准确率 (误差≤0.5): {accuracy_rate:.4f}")
        
        return {
            'model': als_model,
            'predictions': predictions,
            'train_data': train_data,
            'test_data': test_data,
            'metrics': {
                'rmse': rmse,
                'mae': mae,
                'accuracy_rate': accuracy_rate
            }
        }
    
    def hyperparameter_tuning_als(self, df):
        """
        ALS超参数调优
        """
        print("\n=== ALS超参数调优 ===")
        
        # 数据分割
        train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)
        
        # 参数网格
        ranks = [5, 10, 15]
        reg_params = [0.01, 0.1, 1.0]
        
        print(f"  测试参数组合数: {len(ranks) * len(reg_params)}")
        
        best_rmse = float('inf')
        best_params = None
        results = []
        
        evaluator = RegressionEvaluator(
            metricName="rmse",
            labelCol="rating",
            predictionCol="prediction"
        )
        
        for rank in ranks:
            for reg_param in reg_params:
                print(f"\n  测试参数: rank={rank}, regParam={reg_param}")
                
                # 创建和训练模型
                als = ALS(
                    userCol="user_id",
                    itemCol="item_id",
                    ratingCol="rating",
                    rank=rank,
                    maxIter=10,
                    regParam=reg_param,
                    coldStartStrategy="drop",
                    seed=42
                )
                
                model = als.fit(train_data)
                predictions = model.transform(test_data)
                
                rmse = evaluator.evaluate(predictions)
                
                result = {
                    'rank': rank,
                    'reg_param': reg_param,
                    'rmse': rmse
                }
                
                results.append(result)
                
                print(f"    RMSE: {rmse:.4f}")
                
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_params = {'rank': rank, 'reg_param': reg_param}
        
        print(f"\n  最佳参数:")
        print(f"    rank: {best_params['rank']}")
        print(f"    regParam: {best_params['reg_param']}")
        print(f"    最佳RMSE: {best_rmse:.4f}")
        
        return {
            'best_params': best_params,
            'best_rmse': best_rmse,
            'all_results': results
        }
    
    def generate_recommendations(self, model_results, user_id, num_recommendations=10):
        """
        为指定用户生成推荐
        """
        print(f"\n=== 为用户 {user_id} 生成推荐 ===")
        
        als_model = model_results['model']
        
        # 获取该用户已评分的电影
        user_ratings = model_results['train_data'].filter(
            col("user_id") == user_id
        ).select("item_id").rdd.map(lambda x: x[0]).collect()
        
        print(f"  用户已评分电影数: {len(user_ratings)}")
        
        # 获取所有电影
        all_items = model_results['train_data'].select("item_id").distinct()
        
        # 过滤掉已评分的电影
        unrated_items = all_items.filter(~col("item_id").isin(user_ratings))
        
        # 为用户创建候选电影列表
        user_unrated = unrated_items.withColumn("user_id", lit(user_id))
        
        print(f"  候选推荐电影数: {user_unrated.count()}")
        
        # 生成预测评分
        recommendations = als_model.transform(user_unrated)
        
        # 获取Top-N推荐
        top_recommendations = recommendations.orderBy(
            col("prediction").desc()
        ).limit(num_recommendations)
        
        print(f"\n  Top-{num_recommendations} 推荐:")
        top_recommendations.select(
            "item_id", "prediction"
        ).show(num_recommendations)
        
        return top_recommendations
    
    def evaluate_recommendations(self, model_results, k=10):
        """
        评估推荐质量
        """
        print(f"\n=== 推荐质量评估 (Top-{k}) ===")
        
        als_model = model_results['model']
        test_data = model_results['test_data']
        
        # 为所有用户生成推荐
        users = test_data.select("user_id").distinct()
        
        print(f"  评估用户数: {users.count()}")
        
        # 生成用户推荐
        user_recs = als_model.recommendForUserSubset(users, k)
        
        print("  用户推荐样例:")
        user_recs.show(5, truncate=False)
        
        # 计算覆盖率
        all_items = model_results['train_data'].select("item_id").distinct().count()
        
        recommended_items = user_recs.select(
            explode(col("recommendations.item_id")).alias("item_id")
        ).distinct().count()
        
        coverage = recommended_items / all_items
        
        print(f"\n  推荐系统指标:")
        print(f"    物品覆盖率: {coverage:.4f}")
        print(f"    推荐物品数: {recommended_items}")
        print(f"    总物品数: {all_items}")
        
        # 计算推荐多样性
        avg_recommendations = user_recs.agg(
            avg(size(col("recommendations"))).alias("avg_recs")
        ).collect()[0]["avg_recs"]
        
        print(f"    平均推荐数: {avg_recommendations:.2f}")
        
        return {
            'coverage': coverage,
            'recommended_items': recommended_items,
            'total_items': all_items,
            'user_recommendations': user_recs
        }
    
    def visualize_als_results(self, model_results, tuning_results=None):
        """
        可视化ALS推荐系统结果
        """
        import matplotlib.pyplot as plt
        import seaborn as sns
        import numpy as np
        import pandas as pd
        
        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('ALS协同过滤推荐系统分析结果', fontsize=16, fontweight='bold')
        
        predictions = model_results['predictions']
        metrics = model_results['metrics']
        
        # 转换为Pandas DataFrame进行可视化
        df_pandas = predictions.select(
            "user_id", "item_id", "rating", "prediction"
        ).toPandas()
        
        # 1. 预测值 vs 实际值散点图
        axes[0, 0].scatter(df_pandas['rating'], df_pandas['prediction'], 
                          alpha=0.5, s=10)
        axes[0, 0].plot([1, 5], [1, 5], 'r--', linewidth=2)
        axes[0, 0].set_xlabel('实际评分')
        axes[0, 0].set_ylabel('预测评分')
        axes[0, 0].set_title('预测值 vs 实际值')
        axes[0, 0].grid(True, alpha=0.3)
        
        # 添加相关系数
        correlation = df_pandas['rating'].corr(df_pandas['prediction'])
        axes[0, 0].text(0.05, 0.95, f'相关系数: {correlation:.3f}', 
                        transform=axes[0, 0].transAxes, 
                        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
        
        # 2. 误差分布
        errors = df_pandas['prediction'] - df_pandas['rating']
        axes[0, 1].hist(errors, bins=50, alpha=0.7, edgecolor='black')
        axes[0, 1].axvline(errors.mean(), color='red', linestyle='--', 
                          label=f'均值: {errors.mean():.3f}')
        axes[0, 1].axvline(0, color='green', linestyle='-', 
                          label='理想值: 0')
        axes[0, 1].set_xlabel('预测误差 (预测值 - 实际值)')
        axes[0, 1].set_ylabel('频次')
        axes[0, 1].set_title('预测误差分布')
        axes[0, 1].legend()
        axes[0, 1].grid(True, alpha=0.3)
        
        # 3. 评分分布对比
        x = np.arange(1, 6)
        actual_counts = [sum(df_pandas['rating'] == i) for i in x]
        pred_counts = [sum((df_pandas['prediction'] >= i-0.5) & 
                          (df_pandas['prediction'] < i+0.5)) for i in x]
        
        width = 0.35
        axes[0, 2].bar(x - width/2, actual_counts, width, 
                      label='实际评分', alpha=0.7)
        axes[0, 2].bar(x + width/2, pred_counts, width, 
                      label='预测评分', alpha=0.7)
        axes[0, 2].set_xlabel('评分')
        axes[0, 2].set_ylabel('数量')
        axes[0, 2].set_title('评分分布对比')
        axes[0, 2].set_xticks(x)
        axes[0, 2].legend()
        axes[0, 2].grid(True, alpha=0.3)
        
        # 4. 超参数调优结果(如果有)
        if tuning_results:
            results = tuning_results['all_results']
            
            # 创建热力图数据
            ranks = sorted(list(set([r['rank'] for r in results])))
            reg_params = sorted(list(set([r['reg_param'] for r in results])))
            
            heatmap_data = np.zeros((len(reg_params), len(ranks)))
            
            for result in results:
                i = reg_params.index(result['reg_param'])
                j = ranks.index(result['rank'])
                heatmap_data[i, j] = result['rmse']
            
            im = axes[1, 0].imshow(heatmap_data, cmap='YlOrRd', aspect='auto')
            axes[1, 0].set_xticks(range(len(ranks)))
            axes[1, 0].set_xticklabels(ranks)
            axes[1, 0].set_yticks(range(len(reg_params)))
            axes[1, 0].set_yticklabels([f'{p:.2f}' for p in reg_params])
            axes[1, 0].set_xlabel('隐因子维度 (rank)')
            axes[1, 0].set_ylabel('正则化参数')
            axes[1, 0].set_title('超参数调优热力图 (RMSE)')
            
            # 添加数值标注
            for i in range(len(reg_params)):
                for j in range(len(ranks)):
                    axes[1, 0].text(j, i, f'{heatmap_data[i, j]:.3f}',
                                   ha='center', va='center', color='black')
            
            plt.colorbar(im, ax=axes[1, 0])
        
        # 5. 用户评分行为分析
        user_stats = df_pandas.groupby('user_id').agg({
            'rating': ['count', 'mean', 'std']
        }).reset_index()
        user_stats.columns = ['user_id', 'rating_count', 'rating_mean', 'rating_std']
        
        axes[1, 1].scatter(user_stats['rating_count'], user_stats['rating_mean'], 
                          alpha=0.6, s=20)
        axes[1, 1].set_xlabel('用户评分数量')
        axes[1, 1].set_ylabel('用户平均评分')
        axes[1, 1].set_title('用户评分行为分析')
        axes[1, 1].grid(True, alpha=0.3)
        
        # 6. 物品评分分析
        item_stats = df_pandas.groupby('item_id').agg({
            'rating': ['count', 'mean']
        }).reset_index()
        item_stats.columns = ['item_id', 'rating_count', 'rating_mean']
        
        axes[1, 2].scatter(item_stats['rating_count'], item_stats['rating_mean'], 
                          alpha=0.6, s=20)
        axes[1, 2].set_xlabel('物品评分数量')
        axes[1, 2].set_ylabel('物品平均评分')
        axes[1, 2].set_title('物品受欢迎程度分析')
        axes[1, 2].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()
        
        # 输出详细分析
        print("\n=== ALS推荐系统分析报告 ===")
        print(f"模型性能指标:")
        print(f"  - RMSE: {metrics['rmse']:.4f}")
        print(f"  - MAE: {metrics['mae']:.4f}")
        print(f"  - 预测准确率: {metrics['accuracy_rate']:.4f}")
        print(f"  - 预测-实际相关系数: {correlation:.4f}")
        
        print(f"\n误差分析:")
        print(f"  - 平均误差: {errors.mean():.4f}")
        print(f"  - 误差标准差: {errors.std():.4f}")
        print(f"  - 误差范围: [{errors.min():.4f}, {errors.max():.4f}]")
        
        print(f"\n用户行为分析:")
        print(f"  - 平均每用户评分数: {user_stats['rating_count'].mean():.2f}")
        print(f"  - 用户平均评分: {user_stats['rating_mean'].mean():.3f}")
        print(f"  - 活跃用户数 (评分>10): {sum(user_stats['rating_count'] > 10)}")
        
        print(f"\n物品分析:")
        print(f"  - 平均每物品评分数: {item_stats['rating_count'].mean():.2f}")
        print(f"  - 物品平均评分: {item_stats['rating_mean'].mean():.3f}")
        print(f"  - 热门物品数 (评分>10): {sum(item_stats['rating_count'] > 10)}")
        
        if tuning_results:
            best_params = tuning_results['best_params']
            print(f"\n最优参数:")
            print(f"  - 最佳rank: {best_params['rank']}")
            print(f"  - 最佳regParam: {best_params['reg_param']}")
            print(f"  - 最佳RMSE: {tuning_results['best_rmse']:.4f}")
        
        return {
            'correlation': correlation,
            'error_stats': {
                'mean': errors.mean(),
                'std': errors.std(),
                'min': errors.min(),
                'max': errors.max()
            },
            'user_stats': user_stats,
            'item_stats': item_stats
        }

# 演示代码
if __name__ == "__main__":
    # 创建演示实例
    demo = ALSDemo()
    
    # 1. 创建数据集
    print("=== 创建电影评分数据集 ===")
    df = demo.create_dataset(n_users=1000, n_items=500, n_ratings=50000)
    
    # 2. 超参数调优
    tuning_results = demo.hyperparameter_tuning_als(df)
    
    # 3. 使用最佳参数训练模型
    best_params = tuning_results['best_params']
    model_results = demo.train_als(
        df, 
        rank=best_params['rank'], 
        reg_param=best_params['reg_param']
    )
    
    # 4. 生成推荐
    user_id = 1  # 为用户1生成推荐
    recommendations = demo.generate_recommendations(model_results, user_id, 10)
    
    # 5. 评估推荐质量
    eval_results = demo.evaluate_recommendations(model_results, k=10)
    
    # 6. 可视化结果
    demo.visualize_als_results(model_results, tuning_results)
    
    print("\n=== ALS协同过滤演示完成 ===")

总结:

ALS协同过滤算法具有以下特点:

  1. 优点

    • 能够处理大规模稀疏数据
    • 支持并行计算,适合分布式环境
    • 能够发现隐含的用户-物品关系
    • 结合 coldStartStrategy 可在预测阶段稳健处理训练中未出现的用户/物品
  2. 缺点

    • 需要大量的用户-物品交互数据
    • 对新用户和新物品的推荐效果较差
    • 缺乏可解释性
    • 容易产生流行度偏差
  3. 适用场景

    • 电影、音乐、图书推荐
    • 电商商品推荐
    • 内容推荐系统
    • 社交网络推荐
  4. 业务建议

    • 结合内容特征进行混合推荐(示意代码见下)
    • 定期重训练模型以适应用户偏好变化
    • 考虑时间因素和季节性影响
    • 平衡推荐的准确性和多样性
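针对上面“结合内容特征进行混合推荐”一条,下面给出一个把 ALS 预测分与内容相似度分做线性加权的极简示意;其中 content_scores 表、content_score 列以及权重 alpha 都是假设,实际使用前需先把两类分数缩放到同一量纲:

from pyspark.sql import functions as F

def blend_scores(als_predictions, content_scores, alpha=0.7):
    """
    混合推荐打分示意:final_score = alpha * ALS预测分 + (1 - alpha) * 内容相似度分
    假设 content_scores 含 (user_id, item_id, content_score) 三列,且两类分数已归一化到同一量纲
    """
    return (als_predictions
            .join(content_scores, ["user_id", "item_id"], "left")
            .fillna({"content_score": 0.0})
            .withColumn("final_score",
                        alpha * F.col("prediction") + (1 - alpha) * F.col("content_score")))

# 用法示意:
# hybrid = blend_scores(model_results['predictions'], content_scores_df, alpha=0.7)
# hybrid.orderBy(F.col("final_score").desc()).show(10)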

7. 模型评估与优化

7.1 交叉验证

交叉验证把数据划分为K折,每轮用其中一折做验证、其余折做训练,并对K轮结果取平均,是评估机器学习模型泛化能力的重要方法。
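下面的完整示例基于 CrossValidator 做K折交叉验证;当数据量很大、训练代价过高时,也可以改用 Spark 提供的 TrainValidationSplit 只做一次训练/验证划分作为折中。这里先给一个极简示意(features、label 列名沿用本节数据集的约定,属假设):

from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr = LogisticRegression(featuresCol="features", labelCol="label")
param_grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1, 1.0]).build()

# trainRatio=0.8 表示 80% 训练、20% 验证,每组参数只训练一次,成本远低于K折交叉验证
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=param_grid,
    evaluator=BinaryClassificationEvaluator(labelCol="label"),
    trainRatio=0.8,
    seed=42
)
# tvs_model = tvs.fit(df_assembled)  # df_assembled 需已包含向量化后的 features 列

下面回到 CrossValidator 的完整演示。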

from pyspark.sql import SparkSession
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler

class CrossValidationDemo:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("CrossValidationDemo") \
            .getOrCreate()
    
    def create_classification_dataset(self, n_samples=10000):
        """
        创建分类数据集
        """
        import numpy as np
        from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
        
        # 生成模拟数据
        np.random.seed(42)
        
        # 特征1: 年龄 (18-80)
        age = np.random.normal(40, 15, n_samples)
        age = np.clip(age, 18, 80)
        
        # 特征2: 收入 (20000-200000)
        income = np.random.lognormal(10.5, 0.8, n_samples)
        income = np.clip(income, 20000, 200000)
        
        # 特征3: 信用评分 (300-850)
        credit_score = np.random.normal(650, 100, n_samples)
        credit_score = np.clip(credit_score, 300, 850)
        
        # 特征4: 工作年限 (0-40)
        work_years = np.random.exponential(8, n_samples)
        work_years = np.clip(work_years, 0, 40)
        
        # 特征5: 负债比率 (0-1)
        debt_ratio = np.random.beta(2, 5, n_samples)
        
        # 生成标签(贷款违约:0=正常,1=违约)
        # 基于特征的逻辑回归模型
        logit = (-8 + 
                0.02 * age + 
                0.00002 * income + 
                0.01 * credit_score + 
                0.05 * work_years - 
                5 * debt_ratio + 
                np.random.normal(0, 1, n_samples))
        
        probability = 1 / (1 + np.exp(-logit))
        labels = (np.random.random(n_samples) < probability).astype(int)
        
        # 创建DataFrame
        schema = StructType([
            StructField("age", DoubleType(), True),
            StructField("income", DoubleType(), True),
            StructField("credit_score", DoubleType(), True),
            StructField("work_years", DoubleType(), True),
            StructField("debt_ratio", DoubleType(), True),
            StructField("label", IntegerType(), True)
        ])
        
        # numpy标量需转换为Python原生类型,避免createDataFrame因类型不兼容报错
        data = [(float(a), float(inc), float(cs), float(wy), float(dr), int(lb))
                for a, inc, cs, wy, dr, lb in zip(age, income, credit_score, work_years, debt_ratio, labels)]
        df = self.spark.createDataFrame(data, schema)
        
        # 数据概览
        print("数据集概览:")
        df.show(10)
        
        print("\n数据统计:")
        df.describe().show()
        
        print("\n标签分布:")
        df.groupBy("label").count().show()
        
        return df
    
    def cross_validation_logistic(self, df, num_folds=5):
        """
        逻辑回归交叉验证
        """
        from pyspark.ml.feature import VectorAssembler, StandardScaler
        from pyspark.ml.classification import LogisticRegression
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
        from pyspark.ml import Pipeline
        
        # 特征工程
        feature_cols = ["age", "income", "credit_score", "work_years", "debt_ratio"]
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw")
        scaler = StandardScaler(inputCol="features_raw", outputCol="features")
        
        # 逻辑回归模型
        lr = LogisticRegression(featuresCol="features", labelCol="label")
        
        # 创建Pipeline
        pipeline = Pipeline(stages=[assembler, scaler, lr])
        
        # 参数网格
        param_grid = ParamGridBuilder() \
            .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
            .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
            .build()
        
        # 评估器
        evaluator = BinaryClassificationEvaluator(
            labelCol="label", 
            rawPredictionCol="rawPrediction",
            metricName="areaUnderROC"
        )
        
        # 交叉验证
        cv = CrossValidator(
            estimator=pipeline,
            estimatorParamMaps=param_grid,
            evaluator=evaluator,
            numFolds=num_folds,
            seed=42
        )
        
        print(f"开始{num_folds}折交叉验证...")
        cv_model = cv.fit(df)
        
        # 获取最佳模型
        best_model = cv_model.bestModel
        best_lr = best_model.stages[-1]
        
        # 获取最佳参数
        best_reg_param = best_lr.getRegParam()
        best_elastic_net = best_lr.getElasticNetParam()
        
        # 交叉验证分数
        cv_scores = cv_model.avgMetrics
        best_score = max(cv_scores)
        
        print(f"\n交叉验证结果:")
        print(f"最佳AUC分数: {best_score:.4f}")
        print(f"最佳正则化参数: {best_reg_param}")
        print(f"最佳弹性网络参数: {best_elastic_net}")
        
        # 所有参数组合的结果
        results = []
        for i, (params, score) in enumerate(zip(param_grid, cv_scores)):
            reg_param = params[lr.regParam]
            elastic_net = params[lr.elasticNetParam]
            results.append({
                'reg_param': reg_param,
                'elastic_net': elastic_net,
                'auc_score': score
            })
        
        return {
            'best_model': cv_model,
            'best_score': best_score,
            'best_params': {
                'reg_param': best_reg_param,
                'elastic_net': best_elastic_net
            },
            'all_results': results,
            'cv_scores': cv_scores
        }
    
    def manual_cross_validation(self, df, num_folds=5):
        """
        手动实现交叉验证
        """
        from pyspark.ml.feature import VectorAssembler, StandardScaler
        from pyspark.ml.classification import LogisticRegression
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        from pyspark.ml import Pipeline
        from pyspark.sql import functions as F
        import numpy as np
        
        # 添加随机折数列
        df_with_fold = df.withColumn(
            "fold", 
            (F.rand(seed=42) * num_folds).cast("int")
        )
        
        fold_results = []
        
        for fold in range(num_folds):
            print(f"\n训练第 {fold + 1} 折...")
            
            # 分割数据
            train_df = df_with_fold.filter(F.col("fold") != fold)
            test_df = df_with_fold.filter(F.col("fold") == fold)
            
            # 特征工程
            feature_cols = ["age", "income", "credit_score", "work_years", "debt_ratio"]
            assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw")
            scaler = StandardScaler(inputCol="features_raw", outputCol="features")
            lr = LogisticRegression(featuresCol="features", labelCol="label")
            
            # 创建Pipeline
            pipeline = Pipeline(stages=[assembler, scaler, lr])
            
            # 训练模型
            model = pipeline.fit(train_df)
            
            # 预测
            predictions = model.transform(test_df)
            
            # 评估
            evaluator = BinaryClassificationEvaluator(
                labelCol="label", 
                rawPredictionCol="rawPrediction"
            )
            
            auc = evaluator.evaluate(predictions)
            
            # 计算其他指标
            tp = predictions.filter((F.col("label") == 1) & (F.col("prediction") == 1)).count()
            tn = predictions.filter((F.col("label") == 0) & (F.col("prediction") == 0)).count()
            fp = predictions.filter((F.col("label") == 0) & (F.col("prediction") == 1)).count()
            fn = predictions.filter((F.col("label") == 1) & (F.col("prediction") == 0)).count()
            
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
            accuracy = (tp + tn) / (tp + tn + fp + fn)
            
            fold_result = {
                'fold': fold + 1,
                'auc': auc,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'train_size': train_df.count(),
                'test_size': test_df.count()
            }
            
            fold_results.append(fold_result)
            
            print(f"第 {fold + 1} 折结果:")
            print(f"  AUC: {auc:.4f}")
            print(f"  准确率: {accuracy:.4f}")
            print(f"  精确率: {precision:.4f}")
            print(f"  召回率: {recall:.4f}")
            print(f"  F1分数: {f1:.4f}")
        
        # 计算平均指标
        avg_metrics = {
            'avg_auc': np.mean([r['auc'] for r in fold_results]),
            'std_auc': np.std([r['auc'] for r in fold_results]),
            'avg_accuracy': np.mean([r['accuracy'] for r in fold_results]),
            'std_accuracy': np.std([r['accuracy'] for r in fold_results]),
            'avg_precision': np.mean([r['precision'] for r in fold_results]),
            'std_precision': np.std([r['precision'] for r in fold_results]),
            'avg_recall': np.mean([r['recall'] for r in fold_results]),
            'std_recall': np.std([r['recall'] for r in fold_results]),
            'avg_f1': np.mean([r['f1'] for r in fold_results]),
            'std_f1': np.std([r['f1'] for r in fold_results])
        }
        
        print(f"\n{num_folds}折交叉验证平均结果:")
        print(f"AUC: {avg_metrics['avg_auc']:.4f} ± {avg_metrics['std_auc']:.4f}")
        print(f"准确率: {avg_metrics['avg_accuracy']:.4f} ± {avg_metrics['std_accuracy']:.4f}")
        print(f"精确率: {avg_metrics['avg_precision']:.4f} ± {avg_metrics['std_precision']:.4f}")
        print(f"召回率: {avg_metrics['avg_recall']:.4f} ± {avg_metrics['std_recall']:.4f}")
        print(f"F1分数: {avg_metrics['avg_f1']:.4f} ± {avg_metrics['std_f1']:.4f}")
        
        return {
            'fold_results': fold_results,
            'avg_metrics': avg_metrics
        }
    
    def compare_models_cv(self, df, num_folds=5):
        """
        比较不同模型的交叉验证性能
        """
        from pyspark.ml.feature import VectorAssembler, StandardScaler
        from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
        from pyspark.ml import Pipeline
        
        # 特征工程
        feature_cols = ["age", "income", "credit_score", "work_years", "debt_ratio"]
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw")
        scaler = StandardScaler(inputCol="features_raw", outputCol="features")
        
        # 评估器
        evaluator = BinaryClassificationEvaluator(
            labelCol="label", 
            rawPredictionCol="rawPrediction",
            metricName="areaUnderROC"
        )
        
        models_results = {}
        
        # 1. 逻辑回归
        print("\n=== 逻辑回归交叉验证 ===")
        lr = LogisticRegression(featuresCol="features", labelCol="label")
        lr_pipeline = Pipeline(stages=[assembler, scaler, lr])
        
        lr_param_grid = ParamGridBuilder() \
            .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
            .build()
        
        lr_cv = CrossValidator(
            estimator=lr_pipeline,
            estimatorParamMaps=lr_param_grid,
            evaluator=evaluator,
            numFolds=num_folds,
            seed=42
        )
        
        lr_model = lr_cv.fit(df)
        lr_best_score = max(lr_model.avgMetrics)  # avgMetrics 属于拟合后的 CrossValidatorModel
        
        models_results['LogisticRegression'] = {
            'best_score': lr_best_score,
            'model': lr_model
        }
        
        print(f"逻辑回归最佳AUC: {lr_best_score:.4f}")
        
        # 2. 随机森林
        print("\n=== 随机森林交叉验证 ===")
        rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=42)
        rf_pipeline = Pipeline(stages=[assembler, rf])  # 随机森林不需要标准化
        
        rf_param_grid = ParamGridBuilder() \
            .addGrid(rf.numTrees, [50, 100]) \
            .addGrid(rf.maxDepth, [5, 10]) \
            .build()
        
        rf_cv = CrossValidator(
            estimator=rf_pipeline,
            estimatorParamMaps=rf_param_grid,
            evaluator=evaluator,
            numFolds=num_folds,
            seed=42
        )
        
        rf_model = rf_cv.fit(df)
        rf_best_score = max(rf_model.avgMetrics)
        
        models_results['RandomForest'] = {
            'best_score': rf_best_score,
            'model': rf_model
        }
        
        print(f"随机森林最佳AUC: {rf_best_score:.4f}")
        
        # 3. 梯度提升树
        print("\n=== 梯度提升树交叉验证 ===")
        gbt = GBTClassifier(featuresCol="features", labelCol="label", seed=42)
        gbt_pipeline = Pipeline(stages=[assembler, gbt])
        
        gbt_param_grid = ParamGridBuilder() \
            .addGrid(gbt.maxIter, [50, 100]) \
            .addGrid(gbt.maxDepth, [3, 5]) \
            .build()
        
        gbt_cv = CrossValidator(
            estimator=gbt_pipeline,
            estimatorParamMaps=gbt_param_grid,
            evaluator=evaluator,
            numFolds=num_folds,
            seed=42
        )
        
        gbt_model = gbt_cv.fit(df)
        gbt_best_score = max(gbt_model.avgMetrics)
        
        models_results['GBT'] = {
            'best_score': gbt_best_score,
            'model': gbt_model
        }
        
        print(f"梯度提升树最佳AUC: {gbt_best_score:.4f}")
        
        # 比较结果
        print("\n=== 模型比较结果 ===")
        sorted_models = sorted(models_results.items(), 
                              key=lambda x: x[1]['best_score'], 
                              reverse=True)
        
        for i, (model_name, result) in enumerate(sorted_models):
            print(f"{i+1}. {model_name}: AUC = {result['best_score']:.4f}")
        
        return models_results
    
    def visualize_cv_results(self, cv_results, manual_cv_results=None, model_comparison=None):
        """
        可视化交叉验证结果
        """
        import matplotlib.pyplot as plt
        import seaborn as sns
        import numpy as np
        import pandas as pd
        
        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('交叉验证分析结果', fontsize=16, fontweight='bold')
        
        # 1. 参数网格搜索热力图
        results = cv_results['all_results']
        
        # 创建参数网格数据
        reg_params = sorted(list(set([r['reg_param'] for r in results])))
        elastic_nets = sorted(list(set([r['elastic_net'] for r in results])))
        
        heatmap_data = np.zeros((len(elastic_nets), len(reg_params)))
        
        for result in results:
            i = elastic_nets.index(result['elastic_net'])
            j = reg_params.index(result['reg_param'])
            heatmap_data[i, j] = result['auc_score']
        
        im = axes[0, 0].imshow(heatmap_data, cmap='YlOrRd', aspect='auto')
        axes[0, 0].set_xticks(range(len(reg_params)))
        axes[0, 0].set_xticklabels([f'{p:.2f}' for p in reg_params])
        axes[0, 0].set_yticks(range(len(elastic_nets)))
        axes[0, 0].set_yticklabels([f'{p:.1f}' for p in elastic_nets])
        axes[0, 0].set_xlabel('正则化参数 (regParam)')
        axes[0, 0].set_ylabel('弹性网络参数 (elasticNet)')
        axes[0, 0].set_title('超参数网格搜索热力图')
        
        # 添加数值标注
        for i in range(len(elastic_nets)):
            for j in range(len(reg_params)):
                axes[0, 0].text(j, i, f'{heatmap_data[i, j]:.3f}',
                               ha='center', va='center', color='black')
        
        plt.colorbar(im, ax=axes[0, 0])
        
        # 2. 交叉验证分数分布
        cv_scores = cv_results['cv_scores']
        axes[0, 1].hist(cv_scores, bins=10, alpha=0.7, edgecolor='black')
        axes[0, 1].axvline(cv_results['best_score'], color='red', linestyle='--', 
                          label=f'最佳分数: {cv_results["best_score"]:.4f}')
        axes[0, 1].axvline(np.mean(cv_scores), color='green', linestyle='--', 
                          label=f'平均分数: {np.mean(cv_scores):.4f}')
        axes[0, 1].set_xlabel('AUC分数')
        axes[0, 1].set_ylabel('频次')
        axes[0, 1].set_title('交叉验证分数分布')
        axes[0, 1].legend()
        axes[0, 1].grid(True, alpha=0.3)
        
        # 3. 参数vs性能关系
        reg_param_scores = {}
        for result in results:
            reg_param = result['reg_param']
            if reg_param not in reg_param_scores:
                reg_param_scores[reg_param] = []
            reg_param_scores[reg_param].append(result['auc_score'])
        
        reg_params_sorted = sorted(reg_param_scores.keys())
        avg_scores = [np.mean(reg_param_scores[rp]) for rp in reg_params_sorted]
        std_scores = [np.std(reg_param_scores[rp]) for rp in reg_params_sorted]
        
        axes[0, 2].errorbar(reg_params_sorted, avg_scores, yerr=std_scores, 
                           marker='o', capsize=5, capthick=2)
        axes[0, 2].set_xlabel('正则化参数')
        axes[0, 2].set_ylabel('平均AUC分数')
        axes[0, 2].set_title('正则化参数 vs 模型性能')
        axes[0, 2].set_xscale('log')
        axes[0, 2].grid(True, alpha=0.3)
        
        # 4. 手动交叉验证结果(如果有)
        if manual_cv_results:
            fold_results = manual_cv_results['fold_results']
            
            metrics = ['auc', 'accuracy', 'precision', 'recall', 'f1']
            metric_values = {metric: [r[metric] for r in fold_results] for metric in metrics}
            
            # 箱线图
            axes[1, 0].boxplot([metric_values[m] for m in metrics], 
                              labels=['AUC', '准确率', '精确率', '召回率', 'F1'])
            axes[1, 0].set_ylabel('分数')
            axes[1, 0].set_title('各折交叉验证指标分布')
            axes[1, 0].grid(True, alpha=0.3)
            
            # 折数vs性能
            folds = [r['fold'] for r in fold_results]
            auc_scores = [r['auc'] for r in fold_results]
            
            axes[1, 1].plot(folds, auc_scores, 'o-', linewidth=2, markersize=8)
            axes[1, 1].set_xlabel('折数')
            axes[1, 1].set_ylabel('AUC分数')
            axes[1, 1].set_title('各折AUC分数变化')
            axes[1, 1].grid(True, alpha=0.3)
            
            # 添加平均线
            avg_auc = manual_cv_results['avg_metrics']['avg_auc']
            axes[1, 1].axhline(avg_auc, color='red', linestyle='--', 
                              label=f'平均AUC: {avg_auc:.4f}')
            axes[1, 1].legend()
        
        # 5. 模型比较(如果有)
        if model_comparison:
            model_names = list(model_comparison.keys())
            model_scores = [model_comparison[name]['best_score'] for name in model_names]
            
            # 条形图
            bars = axes[1, 2].bar(model_names, model_scores, alpha=0.7)
            axes[1, 2].set_ylabel('最佳AUC分数')
            axes[1, 2].set_title('不同模型性能比较')
            axes[1, 2].tick_params(axis='x', rotation=45)
            
            # 添加数值标注
            for bar, score in zip(bars, model_scores):
                height = bar.get_height()
                axes[1, 2].text(bar.get_x() + bar.get_width()/2., height + 0.001,
                               f'{score:.4f}', ha='center', va='bottom')
            
            axes[1, 2].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()
        
        # 输出详细分析
        print("\n=== 交叉验证分析报告 ===")
        print(f"最佳模型性能:")
        print(f"  - 最佳AUC分数: {cv_results['best_score']:.4f}")
        print(f"  - 最佳正则化参数: {cv_results['best_params']['reg_param']}")
        print(f"  - 最佳弹性网络参数: {cv_results['best_params']['elastic_net']}")
        
        print(f"\n参数搜索统计:")
        print(f"  - 平均AUC分数: {np.mean(cv_scores):.4f}")
        print(f"  - AUC分数标准差: {np.std(cv_scores):.4f}")
        print(f"  - 最佳分数排名: {sorted(cv_scores, reverse=True).index(cv_results['best_score']) + 1}/{len(cv_scores)}")
        
        if manual_cv_results:
            avg_metrics = manual_cv_results['avg_metrics']
            print(f"\n手动交叉验证结果:")
            print(f"  - AUC: {avg_metrics['avg_auc']:.4f} ± {avg_metrics['std_auc']:.4f}")
            print(f"  - 准确率: {avg_metrics['avg_accuracy']:.4f} ± {avg_metrics['std_accuracy']:.4f}")
            print(f"  - 精确率: {avg_metrics['avg_precision']:.4f} ± {avg_metrics['std_precision']:.4f}")
            print(f"  - 召回率: {avg_metrics['avg_recall']:.4f} ± {avg_metrics['std_recall']:.4f}")
            print(f"  - F1分数: {avg_metrics['avg_f1']:.4f} ± {avg_metrics['std_f1']:.4f}")
        
        if model_comparison:
            print(f"\n模型比较结果:")
            sorted_models = sorted(model_comparison.items(), 
                                  key=lambda x: x[1]['best_score'], 
                                  reverse=True)
            for i, (model_name, result) in enumerate(sorted_models):
                print(f"  {i+1}. {model_name}: {result['best_score']:.4f}")
        
        return {
            'parameter_analysis': {
                'best_reg_param': cv_results['best_params']['reg_param'],
                'best_elastic_net': cv_results['best_params']['elastic_net'],
                'score_distribution': {
                    'mean': np.mean(cv_scores),
                    'std': np.std(cv_scores),
                    'min': min(cv_scores),
                    'max': max(cv_scores)
                }
            },
            'manual_cv_summary': manual_cv_results['avg_metrics'] if manual_cv_results else None,
            'model_ranking': sorted_models if model_comparison else None
        }

# 演示代码
if __name__ == "__main__":
    # 创建演示实例
    demo = CrossValidationDemo()
    
    # 1. 创建分类数据集
    print("=== 创建分类数据集 ===")
    df = demo.create_classification_dataset(n_samples=10000)
    
    # 2. 逻辑回归交叉验证
    print("\n=== 逻辑回归交叉验证 ===")
    cv_results = demo.cross_validation_logistic(df, num_folds=5)
    
    # 3. 手动交叉验证
    print("\n=== 手动交叉验证 ===")
    manual_cv_results = demo.manual_cross_validation(df, num_folds=5)
    
    # 4. 模型比较
    print("\n=== 模型比较 ===")
    model_comparison = demo.compare_models_cv(df, num_folds=5)
    
    # 5. 可视化结果
    demo.visualize_cv_results(cv_results, manual_cv_results, model_comparison)
    
    print("\n=== 交叉验证演示完成 ===")

总结:

交叉验证是机器学习中评估模型泛化能力的重要技术:

  1. 优点

    • 充分利用有限的数据
    • 提供模型性能的可靠估计
    • 减少过拟合风险
    • 帮助选择最佳超参数
  2. 注意事项

    • 计算成本较高
    • 需要合理选择折数
    • 数据分布要保持一致
    • 时间序列数据需要特殊处理
  3. 最佳实践

    • 通常使用5折或10折交叉验证
    • 结合网格搜索进行超参数优化
    • 使用分层采样保持类别平衡(分层分折示意见下)
    • 设置随机种子确保结果可重现
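针对上面“使用分层采样保持类别平衡”一条,下面给出一个在每个类别内部随机编号、再按折数取模的 PySpark 示意(label 列名沿用本节约定),这样各折的正负样本比例与整体基本一致:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

def add_stratified_fold(df, label_col="label", num_folds=5, seed=42):
    """在每个类别内部随机排序后编号,再对折数取模,得到分层的fold列(示意)"""
    w = Window.partitionBy(label_col).orderBy(F.rand(seed))
    return df.withColumn("fold", (F.row_number().over(w) - 1) % num_folds)

# 用法示意:
# df_folds = add_stratified_fold(df, num_folds=5)
# train_df = df_folds.filter(F.col("fold") != 0)
# valid_df = df_folds.filter(F.col("fold") == 0)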

7.2 模型选择与集成

模型选择和集成学习可以进一步提升预测性能,常见的集成策略包括投票(Voting)、Bagging 和 Stacking,下面依次演示。

from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col, when, avg as spark_avg

class EnsembleDemo:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("EnsembleDemo") \
            .getOrCreate()
    
    def create_ensemble_dataset(self, n_samples=10000):
        """
        创建集成学习数据集
        """
        import numpy as np
        from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
        
        # 生成更复杂的数据集
        np.random.seed(42)
        
        # 多个特征
        features = {}
        for i in range(10):
            features[f'feature_{i}'] = np.random.normal(0, 1, n_samples)
        
        # 添加一些非线性关系
        features['feature_10'] = features['feature_0'] ** 2
        features['feature_11'] = features['feature_1'] * features['feature_2']
        features['feature_12'] = np.sin(features['feature_3'])
        
        # 生成标签
        logit = (2 * features['feature_0'] + 
                1.5 * features['feature_1'] - 
                0.8 * features['feature_2'] + 
                0.5 * features['feature_10'] + 
                0.3 * features['feature_11'] + 
                np.random.normal(0, 0.5, n_samples))
        
        probability = 1 / (1 + np.exp(-logit))
        labels = (np.random.random(n_samples) < probability).astype(int)
        
        # 创建schema
        fields = [StructField(f'feature_{i}', DoubleType(), True) for i in range(13)]
        fields.append(StructField('label', IntegerType(), True))
        schema = StructType(fields)
        
        # 准备数据(将numpy标量转换为Python原生类型,避免createDataFrame类型不兼容)
        data_list = []
        for i in range(n_samples):
            row = [float(features[f'feature_{j}'][i]) for j in range(13)] + [int(labels[i])]
            data_list.append(tuple(row))
        
        df = self.spark.createDataFrame(data_list, schema)
        
        print("集成学习数据集概览:")
         df.show(10)
         
         return df
    
    def voting_ensemble(self, df):
        """
        投票集成方法
        """
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        from pyspark.sql.functions import col, when, round as spark_round
        
        # 特征工程
        feature_cols = [f'feature_{i}' for i in range(13)]
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        df_assembled = assembler.transform(df)
        
        # 分割数据
        train_df, test_df = df_assembled.randomSplit([0.8, 0.2], seed=42)
        
        print(f"训练集大小: {train_df.count()}")
        print(f"测试集大小: {test_df.count()}")
        
        # 1. 逻辑回归
        lr = LogisticRegression(featuresCol="features", labelCol="label")  # 逻辑回归求解是确定性的,无需随机种子
        lr_model = lr.fit(train_df)
        lr_predictions = lr_model.transform(test_df)
        
        # 2. 随机森林
        rf = RandomForestClassifier(featuresCol="features", labelCol="label", 
                                   numTrees=100, seed=42)
        rf_model = rf.fit(train_df)
        rf_predictions = rf_model.transform(test_df)
        
        # 3. 梯度提升树
        gbt = GBTClassifier(featuresCol="features", labelCol="label", 
                           maxIter=100, seed=42)
        gbt_model = gbt.fit(train_df)
        gbt_predictions = gbt_model.transform(test_df)
        
        # 评估单个模型
        evaluator = BinaryClassificationEvaluator(labelCol="label", 
                                                 rawPredictionCol="rawPrediction")
        
        lr_auc = evaluator.evaluate(lr_predictions)
        rf_auc = evaluator.evaluate(rf_predictions)
        gbt_auc = evaluator.evaluate(gbt_predictions)
        
        print(f"\n单个模型性能:")
        print(f"逻辑回归 AUC: {lr_auc:.4f}")
        print(f"随机森林 AUC: {rf_auc:.4f}")
        print(f"梯度提升树 AUC: {gbt_auc:.4f}")
        
        # 硬投票(多数投票)与软投票:先为三个模型的预测结果添加行号,再按行号对齐
        # 注意:monotonically_increasing_id 只有在各预测DataFrame的分区与行序一致时才能正确对齐;
        # 这里三个模型都是对同一个test_df做transform,可满足演示需求,生产环境建议保留唯一主键列再join
        from pyspark.sql.functions import monotonically_increasing_id
        
        lr_indexed = lr_predictions.select("label", "prediction", "probability").withColumn("id", monotonically_increasing_id()).withColumnRenamed("prediction", "lr_pred").withColumnRenamed("probability", "lr_prob")
        rf_indexed = rf_predictions.select("prediction", "probability").withColumn("id", monotonically_increasing_id()).withColumnRenamed("prediction", "rf_pred").withColumnRenamed("probability", "rf_prob")
        gbt_indexed = gbt_predictions.select("prediction", "probability").withColumn("id", monotonically_increasing_id()).withColumnRenamed("prediction", "gbt_pred").withColumnRenamed("probability", "gbt_prob")
        
        # 合并预测结果
        ensemble_df = lr_indexed.join(rf_indexed, "id").join(gbt_indexed, "id")
        
        # 硬投票
        ensemble_df = ensemble_df.withColumn(
            "hard_vote",
            when((col("lr_pred") + col("rf_pred") + col("gbt_pred")) >= 2, 1.0).otherwise(0.0)
        )
        
        # 软投票(平均概率)
        from pyspark.sql.functions import udf
        from pyspark.sql.types import DoubleType
        import numpy as np
        
        def avg_probability(lr_prob, rf_prob, gbt_prob):
            # 提取正类概率
            lr_pos = float(lr_prob[1])
            rf_pos = float(rf_prob[1])
            gbt_pos = float(gbt_prob[1])
            return (lr_pos + rf_pos + gbt_pos) / 3.0
        
        avg_prob_udf = udf(avg_probability, DoubleType())
        
        ensemble_df = ensemble_df.withColumn(
            "soft_vote_prob",
            avg_prob_udf(col("lr_prob"), col("rf_prob"), col("gbt_prob"))
        ).withColumn(
            "soft_vote",
            when(col("soft_vote_prob") >= 0.5, 1.0).otherwise(0.0)
        )
        
        # 评估集成结果
        def calculate_accuracy(df, pred_col):
            correct = df.filter(col("label") == col(pred_col)).count()
            total = df.count()
            return correct / total
        
        hard_vote_acc = calculate_accuracy(ensemble_df, "hard_vote")
        soft_vote_acc = calculate_accuracy(ensemble_df, "soft_vote")
        
        print(f"\n集成学习结果:")
        print(f"硬投票准确率: {hard_vote_acc:.4f}")
        print(f"软投票准确率: {soft_vote_acc:.4f}")
        
        return {
            'individual_models': {
                'lr': {'model': lr_model, 'auc': lr_auc},
                'rf': {'model': rf_model, 'auc': rf_auc},
                'gbt': {'model': gbt_model, 'auc': gbt_auc}
            },
            'ensemble_results': ensemble_df,
            'hard_vote_accuracy': hard_vote_acc,
            'soft_vote_accuracy': soft_vote_acc,
            'test_data': test_df
        }
    
    def bagging_ensemble(self, df, n_models=5):
        """
        Bagging集成方法
        """
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        from pyspark.sql.functions import col, avg as spark_avg
        import random
        
        # 特征工程
        feature_cols = [f'feature_{i}' for i in range(13)]
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        df_assembled = assembler.transform(df)
        
        # 分割数据
        train_df, test_df = df_assembled.randomSplit([0.8, 0.2], seed=42)
        
        models = []
        predictions_list = []
        
        print(f"训练{n_models}个Bagging模型...")
        
        for i in range(n_models):
            print(f"训练模型 {i+1}/{n_models}")
            
            # Bootstrap采样
            bootstrap_df = train_df.sample(withReplacement=True, fraction=1.0, seed=42+i)
            
            # 训练随机森林(每个模型使用不同的随机种子)
            rf = RandomForestClassifier(
                featuresCol="features", 
                labelCol="label",
                numTrees=50,  # 减少每个模型的树数量
                maxDepth=10,
                seed=42+i
            )
            
            model = rf.fit(bootstrap_df)
            models.append(model)
            
            # 预测
            predictions = model.transform(test_df)
            predictions_list.append(predictions)
        
        # 集成预测结果
        print("\n集成预测结果...")
        
        # 计算平均预测概率
        # 这里以features列作为join键对齐各模型的预测结果,属演示性简化;
        # 更稳妥的做法是事先为每个样本添加唯一id列,再按id对齐
        ensemble_df = predictions_list[0].select("label", "features")
        
        # 添加每个模型的预测概率
        for i, pred_df in enumerate(predictions_list):
            ensemble_df = ensemble_df.join(
                pred_df.select("features", "probability").withColumnRenamed("probability", f"prob_{i}"),
                "features"
            )
        
        # 计算平均概率和最终预测
        from pyspark.sql.functions import udf
        from pyspark.sql.types import DoubleType
        
        def avg_probabilities(*probs):
            # 提取正类概率并计算平均值
            pos_probs = [float(prob[1]) for prob in probs]
            return sum(pos_probs) / len(pos_probs)
        
        avg_prob_udf = udf(avg_probabilities, DoubleType())
        
        prob_cols = [col(f"prob_{i}") for i in range(n_models)]
        ensemble_df = ensemble_df.withColumn(
            "avg_probability",
            avg_prob_udf(*prob_cols)
        ).withColumn(
            "prediction",
            when(col("avg_probability") >= 0.5, 1.0).otherwise(0.0)
        )
        
        # 评估结果
        def calculate_accuracy(df, pred_col):
            correct = df.filter(col("label") == col(pred_col)).count()
            total = df.count()
            return correct / total
        
        # 单个模型性能
        evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
        individual_aucs = []
        individual_accs = []
        
        for i, pred_df in enumerate(predictions_list):
            auc = evaluator.evaluate(pred_df)
            acc = calculate_accuracy(pred_df, "prediction")
            individual_aucs.append(auc)
            individual_accs.append(acc)
        
        # 集成模型性能
        ensemble_acc = calculate_accuracy(ensemble_df, "prediction")
        
        print(f"\nBagging集成结果:")
        print(f"单个模型平均AUC: {sum(individual_aucs)/len(individual_aucs):.4f}")
        print(f"单个模型平均准确率: {sum(individual_accs)/len(individual_accs):.4f}")
        print(f"集成模型准确率: {ensemble_acc:.4f}")
        
        return {
            'models': models,
            'individual_aucs': individual_aucs,
            'individual_accuracies': individual_accs,
            'ensemble_accuracy': ensemble_acc,
            'ensemble_predictions': ensemble_df,
            'test_data': test_df
        }
    
    def stacking_ensemble(self, df):
        """
        Stacking集成方法
        """
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        from pyspark.sql.functions import col
        
        # 特征工程
        feature_cols = [f'feature_{i}' for i in range(13)]
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        df_assembled = assembler.transform(df)
        
        # 分割数据:训练集、验证集、测试集
        train_df, temp_df = df_assembled.randomSplit([0.6, 0.4], seed=42)
        val_df, test_df = temp_df.randomSplit([0.5, 0.5], seed=42)
        
        print(f"训练集大小: {train_df.count()}")
        print(f"验证集大小: {val_df.count()}")
        print(f"测试集大小: {test_df.count()}")
        
        # 第一层:基学习器
        print("\n训练基学习器...")
        
        # 1. 逻辑回归
        lr = LogisticRegression(featuresCol="features", labelCol="label")  # LogisticRegression不支持seed参数
        lr_model = lr.fit(train_df)
        
        # 2. 随机森林
        rf = RandomForestClassifier(featuresCol="features", labelCol="label", 
                                   numTrees=100, seed=42)
        rf_model = rf.fit(train_df)
        
        # 3. 梯度提升树
        gbt = GBTClassifier(featuresCol="features", labelCol="label", 
                           maxIter=100, seed=42)
        gbt_model = gbt.fit(train_df)
        
        # 在验证集上生成预测(用于训练元学习器)
        lr_val_pred = lr_model.transform(val_df)
        rf_val_pred = rf_model.transform(val_df)
        gbt_val_pred = gbt_model.transform(val_df)
        
        # 提取预测概率作为元特征
        from pyspark.sql.functions import udf, monotonically_increasing_id
        from pyspark.sql.types import DoubleType
        
        def extract_prob(probability):
            return float(probability[1])  # 正类概率
        
        extract_prob_udf = udf(extract_prob, DoubleType())
        
        # 创建元特征数据集
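        # 注意:monotonically_increasing_id() 只保证ID单调递增,不保证不同DataFrame之间按行对齐;
        # 生产中建议先为验证集/测试集生成稳定的ID列,再让各基学习器的预测携带该ID进行连接(测试集元特征同理)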
        meta_features = lr_val_pred.select("label").withColumn("id", monotonically_increasing_id()) \
            .join(lr_val_pred.select(extract_prob_udf("probability").alias("lr_prob")).withColumn("id", monotonically_increasing_id()), "id") \
            .join(rf_val_pred.select(extract_prob_udf("probability").alias("rf_prob")).withColumn("id", monotonically_increasing_id()), "id") \
            .join(gbt_val_pred.select(extract_prob_udf("probability").alias("gbt_prob")).withColumn("id", monotonically_increasing_id()), "id")
        
        # 组装元特征
        meta_assembler = VectorAssembler(inputCols=["lr_prob", "rf_prob", "gbt_prob"], 
                                        outputCol="meta_features")
        meta_df = meta_assembler.transform(meta_features)
        
        # 第二层:元学习器
        print("训练元学习器...")
        meta_learner = LogisticRegression(featuresCol="meta_features", labelCol="label")
        meta_model = meta_learner.fit(meta_df)
        
        # 在测试集上进行预测
        print("\n在测试集上预测...")
        
        # 基学习器预测
        lr_test_pred = lr_model.transform(test_df)
        rf_test_pred = rf_model.transform(test_df)
        gbt_test_pred = gbt_model.transform(test_df)
        
        # 创建测试集元特征
        test_meta_features = lr_test_pred.select("label").withColumn("id", monotonically_increasing_id()) \
            .join(lr_test_pred.select(extract_prob_udf("probability").alias("lr_prob")).withColumn("id", monotonically_increasing_id()), "id") \
            .join(rf_test_pred.select(extract_prob_udf("probability").alias("rf_prob")).withColumn("id", monotonically_increasing_id()), "id") \
            .join(gbt_test_pred.select(extract_prob_udf("probability").alias("gbt_prob")).withColumn("id", monotonically_increasing_id()), "id")
        
        test_meta_df = meta_assembler.transform(test_meta_features)
        
        # 元学习器最终预测
        final_predictions = meta_model.transform(test_meta_df)
        
        # 评估结果
        evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
        
        lr_auc = evaluator.evaluate(lr_test_pred)
        rf_auc = evaluator.evaluate(rf_test_pred)
        gbt_auc = evaluator.evaluate(gbt_test_pred)
        stacking_auc = evaluator.evaluate(final_predictions)
        
        def calculate_accuracy(df, pred_col="prediction"):
            correct = df.filter(col("label") == col(pred_col)).count()
            total = df.count()
            return correct / total
        
        lr_acc = calculate_accuracy(lr_test_pred)
        rf_acc = calculate_accuracy(rf_test_pred)
        gbt_acc = calculate_accuracy(gbt_test_pred)
        stacking_acc = calculate_accuracy(final_predictions)
        
        print(f"\nStacking集成结果:")
        print(f"逻辑回归 - AUC: {lr_auc:.4f}, 准确率: {lr_acc:.4f}")
        print(f"随机森林 - AUC: {rf_auc:.4f}, 准确率: {rf_acc:.4f}")
        print(f"梯度提升树 - AUC: {gbt_auc:.4f}, 准确率: {gbt_acc:.4f}")
        print(f"Stacking - AUC: {stacking_auc:.4f}, 准确率: {stacking_acc:.4f}")
        
        return {
            'base_models': {
                'lr': {'model': lr_model, 'auc': lr_auc, 'accuracy': lr_acc},
                'rf': {'model': rf_model, 'auc': rf_auc, 'accuracy': rf_acc},
                'gbt': {'model': gbt_model, 'auc': gbt_auc, 'accuracy': gbt_acc}
            },
            'meta_model': meta_model,
            'stacking_auc': stacking_auc,
            'stacking_accuracy': stacking_acc,
            'final_predictions': final_predictions,
            'test_data': test_df
        }
    
    def visualize_ensemble_results(self, voting_results, bagging_results, stacking_results):
        """
        可视化集成学习结果
        """
        import matplotlib.pyplot as plt
        import numpy as np
        
        plt.style.use('default')
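        # 若中文标题显示异常,可设置中文字体(此处假设系统已安装SimHei)
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False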
        fig, axes = plt.subplots(3, 3, figsize=(18, 15))
        fig.suptitle('集成学习方法对比分析', fontsize=16, fontweight='bold')
        
        # 1. 模型性能对比
        ax1 = axes[0, 0]
        methods = ['逻辑回归', '随机森林', '梯度提升树', '硬投票', '软投票', 'Bagging', 'Stacking']
        
        # 收集各方法得分:单个模型取AUC,集成方法取准确率(两类指标口径不同,此处仅作粗略对比)
        lr_auc = voting_results['individual_models']['lr']['auc']
        rf_auc = voting_results['individual_models']['rf']['auc']
        gbt_auc = voting_results['individual_models']['gbt']['auc']
        
        aucs = [lr_auc, rf_auc, gbt_auc, 
                voting_results['hard_vote_accuracy'], 
                voting_results['soft_vote_accuracy'],
                bagging_results['ensemble_accuracy'],
                stacking_results['stacking_accuracy']]
        
        colors = ['lightblue', 'lightgreen', 'lightcoral', 'gold', 'orange', 'purple', 'red']
        bars = ax1.bar(methods, aucs, color=colors, alpha=0.7)
        ax1.set_title('不同方法性能对比', fontweight='bold')
        ax1.set_ylabel('准确率/AUC')
        ax1.set_ylim(0, 1)
        
        # 添加数值标签
        for bar, auc in zip(bars, aucs):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                    f'{auc:.3f}', ha='center', va='bottom', fontweight='bold')
        
        plt.setp(ax1.get_xticklabels(), rotation=45, ha='right')
        
        # 2. Bagging模型性能分布
        ax2 = axes[0, 1]
        individual_accs = bagging_results['individual_accuracies']
        ax2.boxplot([individual_accs], labels=['Bagging模型'])
        ax2.scatter([1] * len(individual_accs), individual_accs, alpha=0.6, color='red')
        ax2.axhline(y=bagging_results['ensemble_accuracy'], color='blue', 
                   linestyle='--', label=f'集成准确率: {bagging_results["ensemble_accuracy"]:.3f}')
        ax2.set_title('Bagging个体模型性能分布', fontweight='bold')
        ax2.set_ylabel('准确率')
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        
        # 3. 投票方法对比
        ax3 = axes[0, 2]
        vote_methods = ['硬投票', '软投票']
        vote_accs = [voting_results['hard_vote_accuracy'], voting_results['soft_vote_accuracy']]
        bars = ax3.bar(vote_methods, vote_accs, color=['skyblue', 'lightgreen'], alpha=0.7)
        ax3.set_title('投票方法对比', fontweight='bold')
        ax3.set_ylabel('准确率')
        ax3.set_ylim(0, 1)
        
        for bar, acc in zip(bars, vote_accs):
            height = bar.get_height()
            ax3.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                    f'{acc:.3f}', ha='center', va='bottom', fontweight='bold')
        
        # 4. Stacking各层性能
        ax4 = axes[1, 0]
        base_models = ['逻辑回归', '随机森林', '梯度提升树']
        base_accs = [stacking_results['base_models']['lr']['accuracy'],
                    stacking_results['base_models']['rf']['accuracy'],
                    stacking_results['base_models']['gbt']['accuracy']]
        
        x_pos = np.arange(len(base_models))
        bars1 = ax4.bar(x_pos - 0.2, base_accs, 0.4, label='基学习器', alpha=0.7, color='lightblue')
        bars2 = ax4.bar(x_pos + 0.2, [stacking_results['stacking_accuracy']] * 3, 0.4, 
                       label='Stacking', alpha=0.7, color='orange')
        
        ax4.set_title('Stacking基学习器vs集成结果', fontweight='bold')
        ax4.set_ylabel('准确率')
        ax4.set_xticks(x_pos)
        ax4.set_xticklabels(base_models)
        ax4.legend()
        ax4.grid(True, alpha=0.3)
        
        # 5. 集成方法复杂度对比
        ax5 = axes[1, 1]
        ensemble_methods = ['投票', 'Bagging', 'Stacking']
        complexity_scores = [2, 4, 5]  # 相对复杂度评分
        performance_scores = [
            max(voting_results['hard_vote_accuracy'], voting_results['soft_vote_accuracy']),
            bagging_results['ensemble_accuracy'],
            stacking_results['stacking_accuracy']
        ]
        
        scatter = ax5.scatter(complexity_scores, performance_scores, 
                             s=[100, 150, 200], alpha=0.7, 
                             c=['blue', 'green', 'red'])
        
        for i, method in enumerate(ensemble_methods):
            ax5.annotate(method, (complexity_scores[i], performance_scores[i]),
                        xytext=(5, 5), textcoords='offset points')
        
        ax5.set_title('复杂度vs性能权衡', fontweight='bold')
        ax5.set_xlabel('模型复杂度')
        ax5.set_ylabel('性能')
        ax5.grid(True, alpha=0.3)
        
        # 6. 预测概率分布对比
        ax6 = axes[1, 2]
        
        # 获取软投票的概率分布
        voting_probs = voting_results['ensemble_results'].select('soft_vote_prob').rdd.map(lambda x: x[0]).collect()
        stacking_probs = stacking_results['final_predictions'].select('probability').rdd.map(lambda x: float(x[0][1])).collect()
        
        ax6.hist(voting_probs, bins=20, alpha=0.5, label='软投票', color='blue', density=True)
        ax6.hist(stacking_probs, bins=20, alpha=0.5, label='Stacking', color='red', density=True)
        ax6.set_title('预测概率分布对比', fontweight='bold')
        ax6.set_xlabel('预测概率')
        ax6.set_ylabel('密度')
        ax6.legend()
        ax6.grid(True, alpha=0.3)
        
        # 7. 模型稳定性分析
        ax7 = axes[2, 0]
        bagging_aucs = bagging_results['individual_aucs']
        ax7.plot(range(1, len(bagging_aucs) + 1), bagging_aucs, 'o-', color='blue', alpha=0.7)
        ax7.axhline(y=np.mean(bagging_aucs), color='red', linestyle='--', 
                   label=f'平均AUC: {np.mean(bagging_aucs):.3f}')
        ax7.fill_between(range(1, len(bagging_aucs) + 1), 
                        np.mean(bagging_aucs) - np.std(bagging_aucs),
                        np.mean(bagging_aucs) + np.std(bagging_aucs),
                        alpha=0.2, color='red')
        ax7.set_title('Bagging模型稳定性', fontweight='bold')
        ax7.set_xlabel('模型编号')
        ax7.set_ylabel('AUC')
        ax7.legend()
        ax7.grid(True, alpha=0.3)
        
        # 8. 集成效果提升分析
        ax8 = axes[2, 1]
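        # 以三个基础模型的平均AUC作为近似基准,与各集成方法的准确率相减(指标口径不同,提升幅度仅供参考)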
        base_avg_acc = np.mean([lr_auc, rf_auc, gbt_auc])
        ensemble_accs = [
            max(voting_results['hard_vote_accuracy'], voting_results['soft_vote_accuracy']),
            bagging_results['ensemble_accuracy'],
            stacking_results['stacking_accuracy']
        ]
        
        improvements = [(acc - base_avg_acc) * 100 for acc in ensemble_accs]
        colors = ['green' if imp > 0 else 'red' for imp in improvements]
        
        bars = ax8.bar(ensemble_methods, improvements, color=colors, alpha=0.7)
        ax8.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
        ax8.set_title('相对基准模型的性能提升', fontweight='bold')
        ax8.set_ylabel('性能提升 (%)')
        
        for bar, imp in zip(bars, improvements):
            height = bar.get_height()
            ax8.text(bar.get_x() + bar.get_width()/2., height + (0.1 if height > 0 else -0.3),
                    f'{imp:.1f}%', ha='center', va='bottom' if height > 0 else 'top', 
                    fontweight='bold')
        
        # 9. 综合评估雷达图
        ax9 = axes[2, 2]
        
        # 评估维度
        categories = ['准确率', '稳定性', '可解释性', '计算效率', '泛化能力']
        
        # 各方法评分 (1-5分)
        voting_scores = [4, 3, 4, 5, 3]
        bagging_scores = [4, 5, 2, 3, 4]
        stacking_scores = [5, 3, 1, 2, 5]
        
        # 角度
        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
        angles += angles[:1]  # 闭合
        
        # 数据闭合
        voting_scores += voting_scores[:1]
        bagging_scores += bagging_scores[:1]
        stacking_scores += stacking_scores[:1]
        
        ax9.plot(angles, voting_scores, 'o-', linewidth=2, label='投票', color='blue')
        ax9.fill(angles, voting_scores, alpha=0.25, color='blue')
        ax9.plot(angles, bagging_scores, 'o-', linewidth=2, label='Bagging', color='green')
        ax9.fill(angles, bagging_scores, alpha=0.25, color='green')
        ax9.plot(angles, stacking_scores, 'o-', linewidth=2, label='Stacking', color='red')
        ax9.fill(angles, stacking_scores, alpha=0.25, color='red')
        
        ax9.set_xticks(angles[:-1])
        ax9.set_xticklabels(categories)
        ax9.set_ylim(0, 5)
        ax9.set_title('集成方法综合评估', fontweight='bold')
        ax9.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
        ax9.grid(True)
        
        plt.tight_layout()
        plt.show()
        
        # 输出详细分析报告
        print("\n" + "="*60)
        print("集成学习方法详细分析报告")
        print("="*60)
        
        print(f"\n1. 基础模型性能:")
        print(f"   逻辑回归: AUC = {lr_auc:.4f}")
        print(f"   随机森林: AUC = {rf_auc:.4f}")
        print(f"   梯度提升树: AUC = {gbt_auc:.4f}")
        print(f"   平均性能: {np.mean([lr_auc, rf_auc, gbt_auc]):.4f}")
        
        print(f"\n2. 投票集成结果:")
        print(f"   硬投票准确率: {voting_results['hard_vote_accuracy']:.4f}")
        print(f"   软投票准确率: {voting_results['soft_vote_accuracy']:.4f}")
        print(f"   最佳投票方法: {'软投票' if voting_results['soft_vote_accuracy'] > voting_results['hard_vote_accuracy'] else '硬投票'}")
        
        print(f"\n3. Bagging集成结果:")
        print(f"   个体模型平均准确率: {np.mean(bagging_results['individual_accuracies']):.4f}")
        print(f"   个体模型标准差: {np.std(bagging_results['individual_accuracies']):.4f}")
        print(f"   集成模型准确率: {bagging_results['ensemble_accuracy']:.4f}")
        print(f"   性能提升: {(bagging_results['ensemble_accuracy'] - np.mean(bagging_results['individual_accuracies'])) * 100:.2f}%")
        
        print(f"\n4. Stacking集成结果:")
        print(f"   最佳基学习器: {max(stacking_results['base_models'].items(), key=lambda x: x[1]['accuracy'])[0]}")
        print(f"   Stacking准确率: {stacking_results['stacking_accuracy']:.4f}")
        print(f"   相比最佳基学习器提升: {(stacking_results['stacking_accuracy'] - max([v['accuracy'] for v in stacking_results['base_models'].values()])) * 100:.2f}%")
        
        print(f"\n5. 方法选择建议:")
        best_method = max([
            ('投票', max(voting_results['hard_vote_accuracy'], voting_results['soft_vote_accuracy'])),
            ('Bagging', bagging_results['ensemble_accuracy']),
            ('Stacking', stacking_results['stacking_accuracy'])
        ], key=lambda x: x[1])
        
        print(f"   最佳方法: {best_method[0]} (准确率: {best_method[1]:.4f})")
        
        if best_method[0] == '投票':
            print(f"   - 优点: 简单易实现,计算效率高,可解释性好")
            print(f"   - 适用场景: 基础模型性能相近,需要快速部署")
        elif best_method[0] == 'Bagging':
            print(f"   - 优点: 减少过拟合,提高模型稳定性")
            print(f"   - 适用场景: 基础模型容易过拟合,需要稳定性")
        else:
            print(f"   - 优点: 性能最优,能学习模型间的复杂关系")
            print(f"   - 适用场景: 对性能要求极高,可接受较高复杂度")

# 演示代码
if __name__ == "__main__":
    # 创建Spark会话
    spark = SparkSession.builder \
        .appName("EnsembleLearningDemo") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
        .getOrCreate()
    
    try:
        print("=== Spark MLlib 集成学习演示 ===")
        
        # 创建演示实例
        demo = EnsembleDemo(spark)
        
        # 1. 创建数据集
        print("\n1. 创建集成学习数据集...")
        df = demo.create_ensemble_dataset()
        
        # 2. 投票集成
        print("\n2. 投票集成方法...")
        voting_results = demo.voting_ensemble(df)
        
        # 3. Bagging集成
        print("\n3. Bagging集成方法...")
        bagging_results = demo.bagging_ensemble(df, n_models=5)
        
        # 4. Stacking集成
        print("\n4. Stacking集成方法...")
        stacking_results = demo.stacking_ensemble(df)
        
        # 5. 可视化结果
        print("\n5. 可视化集成学习结果...")
        demo.visualize_ensemble_results(voting_results, bagging_results, stacking_results)
        
        print("\n=== 集成学习演示完成 ===")
        
    except Exception as e:
        print(f"演示过程中出现错误: {str(e)}")
        import traceback
        traceback.print_exc()
    
    finally:
        spark.stop()

集成学习总结

优缺点对比

投票集成 (Voting)
  • 优点: 简单易实现,计算效率高,可解释性好
  • 缺点: 性能提升有限,依赖基础模型质量
  • 适用场景: 基础模型性能相近,需要快速部署

Bagging集成
  • 优点: 减少过拟合,提高模型稳定性,可并行训练
  • 缺点: 主要作用是降低方差,对降低偏差帮助有限
  • 适用场景: 基础模型容易过拟合,需要稳定性

Stacking集成
  • 优点: 性能最优,能学习模型间的复杂关系
  • 缺点: 计算复杂度高,容易过拟合,可解释性差
  • 适用场景: 对性能要求极高,可接受较高复杂度

业务建议

  1. 模型选择策略

    • 性能优先: 选择Stacking
    • 稳定性优先: 选择Bagging
    • 效率优先: 选择投票
  2. 实际应用考虑

    • 数据量大: 优先考虑Bagging的并行性
    • 实时预测: 选择投票方法
    • 离线批处理: 可以使用Stacking
  3. 模型维护

    • 定期评估基础模型性能
    • 监控集成效果的稳定性
    • 根据业务需求调整集成策略

9. 实时机器学习

9.1 Spark Streaming + MLlib

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import time

class StreamingMLDemo:
    def __init__(self, spark):
        self.spark = spark
        self.sc = spark.sparkContext
        
    def create_streaming_data(self):
        """
        创建流式数据源
        """
        # 模拟实时数据流
        schema = StructType([
            StructField("user_id", IntegerType(), True),
            StructField("age", IntegerType(), True),
            StructField("income", DoubleType(), True),
            StructField("credit_score", IntegerType(), True),
            StructField("loan_amount", DoubleType(), True),
            StructField("timestamp", TimestampType(), True)
        ])
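        # 说明:上述schema仅描述模拟数据的目标字段结构,下面直接用rate源生成同样字段的流式数据,该schema变量本身未被使用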
        
        # 创建流式DataFrame
        streaming_df = self.spark \
            .readStream \
            .format("rate") \
            .option("rowsPerSecond", 10) \
            .load() \
            .select(
                col("value").alias("user_id"),
                (rand() * 50 + 20).cast("int").alias("age"),
                (rand() * 80000 + 20000).alias("income"),
                (rand() * 300 + 500).cast("int").alias("credit_score"),
                (rand() * 50000 + 5000).alias("loan_amount"),
                col("timestamp")
            )
        
        return streaming_df
        
    def train_base_model(self):
        """
        训练基础模型
        """
        # 创建训练数据
        training_data = []
        for i in range(10000):
            age = np.random.randint(20, 70)
            income = np.random.uniform(20000, 100000)
            credit_score = np.random.randint(500, 800)
            loan_amount = np.random.uniform(5000, 55000)
            
            # 简单的标签生成逻辑
            risk_score = (age - 45) * 0.1 + (income - 50000) * 0.00001 + \
                        (credit_score - 650) * 0.01 + (loan_amount - 30000) * 0.00005
            label = 1 if risk_score > 0 else 0
            
            # 转换为Python原生类型,避免numpy标量与Spark schema类型校验冲突
            training_data.append((i, int(age), float(income), int(credit_score), float(loan_amount), int(label)))
        
        # 创建DataFrame
        schema = StructType([
            StructField("user_id", IntegerType(), True),
            StructField("age", IntegerType(), True),
            StructField("income", DoubleType(), True),
            StructField("credit_score", IntegerType(), True),
            StructField("loan_amount", DoubleType(), True),
            StructField("label", IntegerType(), True)
        ])
        
        df = self.spark.createDataFrame(training_data, schema)
        
        # 特征工程
        feature_cols = ["age", "income", "credit_score", "loan_amount"]
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        df_assembled = assembler.transform(df)
        
        # 训练模型
        lr = LogisticRegression(featuresCol="features", labelCol="label")
        model = lr.fit(df_assembled)
        
        print("基础模型训练完成")
        return model, assembler
        
    def streaming_prediction(self, model, assembler):
        """
        流式预测
        """
        # 创建流式数据
        streaming_df = self.create_streaming_data()
        
        # 特征工程
        feature_cols = ["age", "income", "credit_score", "loan_amount"]
        df_with_features = assembler.transform(streaming_df)
        
        # 预测
        predictions = model.transform(df_with_features)
        
        # 选择输出列
        output_df = predictions.select(
            "user_id", "age", "income", "credit_score", 
            "loan_amount", "prediction", "probability", "timestamp"
        )
        
        # 输出流
        query = output_df.writeStream \
            .outputMode("append") \
            .format("console") \
            .option("truncate", False) \
            .trigger(processingTime='5 seconds') \
            .start()
        
        return query
        
    def online_learning(self):
        """
        在线学习演示
        """
        print("在线学习功能演示...")
        
        # 初始模型
        base_model, assembler = self.train_base_model()
        
        # 模拟新数据到达和模型更新
        for batch in range(5):
            print(f"\n处理批次 {batch + 1}...")
            
            # 生成新的训练数据
            new_data = []
            for i in range(1000):
                age = np.random.randint(20, 70)
                income = np.random.uniform(20000, 100000)
                credit_score = np.random.randint(500, 800)
                loan_amount = np.random.uniform(5000, 55000)
                
                # 标签生成(可能包含概念漂移)
                drift_factor = batch * 0.1  # 模拟概念漂移
                risk_score = (age - 45) * (0.1 + drift_factor) + \
                           (income - 50000) * 0.00001 + \
                           (credit_score - 650) * 0.01 + \
                           (loan_amount - 30000) * 0.00005
                label = 1 if risk_score > 0 else 0
                
                new_data.append((i, int(age), float(income), int(credit_score), float(loan_amount), int(label)))
            
            # 创建新数据DataFrame
            schema = StructType([
                StructField("user_id", IntegerType(), True),
                StructField("age", IntegerType(), True),
                StructField("income", DoubleType(), True),
                StructField("credit_score", IntegerType(), True),
                StructField("loan_amount", DoubleType(), True),
                StructField("label", IntegerType(), True)
            ])
            
            new_df = self.spark.createDataFrame(new_data, schema)
            new_df_assembled = assembler.transform(new_df)
            
            # 重新训练模型
            lr = LogisticRegression(featuresCol="features", labelCol="label")
            updated_model = lr.fit(new_df_assembled)
            
            # 评估模型性能
            predictions = updated_model.transform(new_df_assembled)
            evaluator = BinaryClassificationEvaluator(labelCol="label")
            auc = evaluator.evaluate(predictions)
            
            print(f"批次 {batch + 1} 模型AUC: {auc:.4f}")
            
            # 更新模型
            base_model = updated_model
            
            time.sleep(2)  # 模拟处理间隔
        
        return base_model
        
    def model_monitoring(self, model, assembler):
        """
        模型监控
        """
        print("\n模型性能监控...")
        
        # 创建测试数据流
        test_data = []
        for i in range(1000):
            age = np.random.randint(20, 70)
            income = np.random.uniform(20000, 100000)
            credit_score = np.random.randint(500, 800)
            loan_amount = np.random.uniform(5000, 55000)
            
            # 真实标签
            risk_score = (age - 45) * 0.1 + (income - 50000) * 0.00001 + \
                        (credit_score - 650) * 0.01 + (loan_amount - 30000) * 0.00005
            true_label = 1 if risk_score > 0 else 0
            
            test_data.append((i, int(age), float(income), int(credit_score), float(loan_amount), int(true_label)))
        
        schema = StructType([
            StructField("user_id", IntegerType(), True),
            StructField("age", IntegerType(), True),
            StructField("income", DoubleType(), True),
            StructField("credit_score", IntegerType(), True),
            StructField("loan_amount", DoubleType(), True),
            StructField("true_label", IntegerType(), True)
        ])
        
        test_df = self.spark.createDataFrame(test_data, schema)
        test_df_assembled = assembler.transform(test_df)
        
        # 预测
        predictions = model.transform(test_df_assembled)
        
        # 计算性能指标
        evaluator = BinaryClassificationEvaluator(labelCol="true_label")
        auc = evaluator.evaluate(predictions)
        
        # 计算准确率
        correct_predictions = predictions.filter(col("prediction") == col("true_label")).count()
        total_predictions = predictions.count()
        accuracy = correct_predictions / total_predictions
        
        print(f"模型AUC: {auc:.4f}")
        print(f"模型准确率: {accuracy:.4f}")
        
        # 预测分布分析
        prediction_dist = predictions.groupBy("prediction").count().collect()
        print("\n预测分布:")
        for row in prediction_dist:
            print(f"  预测类别 {int(row['prediction'])}: {row['count']} 个样本")
        
        return {
            'auc': auc,
            'accuracy': accuracy,
            'predictions': predictions
        }
    
    def concept_drift_detection(self, model, assembler):
        """
        概念漂移检测
        """
        print("\n概念漂移检测演示...")
        
        # 基准性能(初始模型在原始数据上的性能)
        baseline_auc = 0.85  # 假设的基准AUC
        
        # 模拟不同程度的概念漂移
        drift_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
        performance_history = []
        
        for drift_level in drift_levels:
            print(f"\n测试漂移程度: {drift_level}")
            
            # 生成带有概念漂移的测试数据
            test_data = []
            for i in range(1000):
                age = np.random.randint(20, 70)
                income = np.random.uniform(20000, 100000)
                credit_score = np.random.randint(500, 800)
                loan_amount = np.random.uniform(5000, 55000)
                
                # 引入概念漂移
                risk_score = (age - 45) * (0.1 + drift_level) + \
                           (income - 50000) * (0.00001 + drift_level * 0.00001) + \
                           (credit_score - 650) * (0.01 + drift_level * 0.01) + \
                           (loan_amount - 30000) * (0.00005 + drift_level * 0.00005)
                label = 1 if risk_score > 0 else 0
                
                test_data.append((i, int(age), float(income), int(credit_score), float(loan_amount), int(label)))
            
            # 创建测试DataFrame
            schema = StructType([
                StructField("user_id", IntegerType(), True),
                StructField("age", IntegerType(), True),
                StructField("income", DoubleType(), True),
                StructField("credit_score", IntegerType(), True),
                StructField("loan_amount", DoubleType(), True),
                StructField("label", IntegerType(), True)
            ])
            
            test_df = self.spark.createDataFrame(test_data, schema)
            test_df_assembled = assembler.transform(test_df)
            
            # 预测和评估
            predictions = model.transform(test_df_assembled)
            evaluator = BinaryClassificationEvaluator(labelCol="label")
            current_auc = evaluator.evaluate(predictions)
            
            performance_history.append({
                'drift_level': drift_level,
                'auc': current_auc,
                'performance_drop': baseline_auc - current_auc
            })
            
            print(f"  当前AUC: {current_auc:.4f}")
            print(f"  性能下降: {(baseline_auc - current_auc):.4f}")
            
            # 漂移检测阈值
            drift_threshold = 0.05
            if (baseline_auc - current_auc) > drift_threshold:
                print(f"  ⚠️ 检测到概念漂移!性能下降超过阈值 {drift_threshold}")
            else:
                print(f"  ✅ 模型性能稳定")
        
        return performance_history
    
    def adaptive_learning(self):
        """
        自适应学习演示
        """
        print("\n自适应学习演示...")
        
        # 初始模型
        base_model, assembler = self.train_base_model()
        
        # 性能监控窗口
        window_size = 5
        performance_window = []
        adaptation_threshold = 0.03
        
        # 模拟连续的数据流和模型适应
        for time_step in range(20):
            print(f"\n时间步 {time_step + 1}:")
            
            # 生成当前时间步的数据(可能包含漂移)
            drift_factor = 0.02 * time_step if time_step > 10 else 0  # 第10步后开始漂移
            
            current_data = []
            for i in range(500):
                age = np.random.randint(20, 70)
                income = np.random.uniform(20000, 100000)
                credit_score = np.random.randint(500, 800)
                loan_amount = np.random.uniform(5000, 55000)
                
                risk_score = (age - 45) * (0.1 + drift_factor) + \
                           (income - 50000) * 0.00001 + \
                           (credit_score - 650) * 0.01 + \
                           (loan_amount - 30000) * 0.00005
                label = 1 if risk_score > 0 else 0
                
                current_data.append((i, int(age), float(income), int(credit_score), float(loan_amount), int(label)))
            
            # 创建当前数据DataFrame
            schema = StructType([
                StructField("user_id", IntegerType(), True),
                StructField("age", IntegerType(), True),
                StructField("income", DoubleType(), True),
                StructField("credit_score", IntegerType(), True),
                StructField("loan_amount", DoubleType(), True),
                StructField("label", IntegerType(), True)
            ])
            
            current_df = self.spark.createDataFrame(current_data, schema)
            current_df_assembled = assembler.transform(current_df)
            
            # 评估当前模型性能
            predictions = base_model.transform(current_df_assembled)
            evaluator = BinaryClassificationEvaluator(labelCol="label")
            current_auc = evaluator.evaluate(predictions)
            
            # 更新性能窗口
            performance_window.append(current_auc)
            if len(performance_window) > window_size:
                performance_window.pop(0)
            
            print(f"  当前AUC: {current_auc:.4f}")
            
            # 检查是否需要模型适应
            if len(performance_window) == window_size:
                recent_avg = sum(performance_window) / len(performance_window)
                baseline_performance = 0.85  # 基准性能
                
                performance_drop = baseline_performance - recent_avg
                print(f"  近期平均AUC: {recent_avg:.4f}")
                print(f"  性能下降: {performance_drop:.4f}")
                
                if performance_drop > adaptation_threshold:
                    print(f"  🔄 触发模型重训练...")
                    
                    # 重新训练模型
                    lr = LogisticRegression(featuresCol="features", labelCol="label")
                    adapted_model = lr.fit(current_df_assembled)
                    
                    # 评估适应后的性能
                    adapted_predictions = adapted_model.transform(current_df_assembled)
                    adapted_auc = evaluator.evaluate(adapted_predictions)
                    
                    print(f"  适应后AUC: {adapted_auc:.4f}")
                    print(f"  性能提升: {(adapted_auc - current_auc):.4f}")
                    
                    # 更新模型
                    base_model = adapted_model
                    
                    # 重置性能窗口
                    performance_window = [adapted_auc]
                else:
                    print(f"  ✅ 模型性能稳定,无需适应")
            
            time.sleep(0.5)  # 模拟时间间隔
        
        return base_model
    
    def visualize_streaming_results(self, performance_history, concept_drift_results):
        """
        可视化流式学习结果
        """
        import matplotlib.pyplot as plt
        import numpy as np
        
        plt.style.use('default')
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('实时机器学习分析', fontsize=16, fontweight='bold')
        
        # 1. 概念漂移对性能的影响
        ax1 = axes[0, 0]
        drift_levels = [item['drift_level'] for item in concept_drift_results]
        aucs = [item['auc'] for item in concept_drift_results]
        
        ax1.plot(drift_levels, aucs, 'o-', color='red', linewidth=2, markersize=8)
        ax1.axhline(y=0.85, color='blue', linestyle='--', label='基准性能')
        ax1.axhline(y=0.80, color='orange', linestyle='--', label='警告阈值')
        ax1.fill_between(drift_levels, aucs, 0.80, where=np.array(aucs) < 0.80, 
                        color='red', alpha=0.3, label='性能不足区域')
        
        ax1.set_title('概念漂移对模型性能的影响', fontweight='bold')
        ax1.set_xlabel('漂移程度')
        ax1.set_ylabel('AUC')
        ax1.legend()
        ax1.grid(True, alpha=0.3)
        
        # 2. 性能下降趋势
        ax2 = axes[0, 1]
        performance_drops = [item['performance_drop'] for item in concept_drift_results]
        
        bars = ax2.bar(range(len(drift_levels)), performance_drops, 
                      color=['green' if drop < 0.05 else 'orange' if drop < 0.1 else 'red' 
                            for drop in performance_drops], alpha=0.7)
        
        ax2.axhline(y=0.05, color='orange', linestyle='--', label='漂移检测阈值')
        ax2.set_title('不同漂移程度下的性能下降', fontweight='bold')
        ax2.set_xlabel('漂移程度')
        ax2.set_ylabel('性能下降')
        ax2.set_xticks(range(len(drift_levels)))
        ax2.set_xticklabels([f'{level:.1f}' for level in drift_levels])
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        
        # 3. 模拟实时性能监控
        ax3 = axes[1, 0]
        time_steps = list(range(1, 21))
        simulated_performance = []
        
        # 模拟性能数据
        for i in time_steps:
            if i <= 10:
                perf = 0.85 + np.random.normal(0, 0.01)  # 稳定期
            else:
                drift = 0.02 * (i - 10)
                perf = 0.85 - drift + np.random.normal(0, 0.01)  # 漂移期
            simulated_performance.append(max(0.6, min(0.9, perf)))
        
        ax3.plot(time_steps, simulated_performance, 'o-', color='blue', alpha=0.7)
        ax3.axhline(y=0.85, color='green', linestyle='--', label='目标性能')
        ax3.axhline(y=0.82, color='orange', linestyle='--', label='警告阈值')
        ax3.axvline(x=10, color='red', linestyle=':', label='漂移开始')
        
        # 标记重训练点
        retrain_points = [15, 18]  # 假设的重训练时间点
        for point in retrain_points:
            ax3.scatter(point, simulated_performance[point-1], color='red', s=100, 
                       marker='^', label='模型重训练' if point == retrain_points[0] else '')
        
        ax3.set_title('实时性能监控与自适应学习', fontweight='bold')
        ax3.set_xlabel('时间步')
        ax3.set_ylabel('模型性能 (AUC)')
        ax3.legend()
        ax3.grid(True, alpha=0.3)
        
        # 4. 流式学习架构图
        ax4 = axes[1, 1]
        ax4.text(0.5, 0.9, '流式机器学习架构', ha='center', va='center', 
                fontsize=14, fontweight='bold', transform=ax4.transAxes)
        
        # 绘制架构流程
        components = [
            (0.1, 0.7, '数据流'),
            (0.3, 0.7, '特征工程'),
            (0.5, 0.7, '模型预测'),
            (0.7, 0.7, '性能监控'),
            (0.9, 0.7, '结果输出'),
            (0.5, 0.4, '概念漂移\n检测'),
            (0.5, 0.1, '模型重训练')
        ]
        
        for x, y, text in components:
            if '检测' in text or '重训练' in text:
                color = 'lightcoral'
            else:
                color = 'lightblue'
            
            bbox = dict(boxstyle="round,pad=0.3", facecolor=color, alpha=0.7)
            ax4.text(x, y, text, ha='center', va='center', 
                    transform=ax4.transAxes, bbox=bbox, fontsize=10)
        
        # 绘制箭头
        arrows = [
            ((0.15, 0.7), (0.25, 0.7)),  # 数据流 -> 特征工程
            ((0.35, 0.7), (0.45, 0.7)),  # 特征工程 -> 模型预测
            ((0.55, 0.7), (0.65, 0.7)),  # 模型预测 -> 性能监控
            ((0.75, 0.7), (0.85, 0.7)),  # 性能监控 -> 结果输出
            ((0.7, 0.65), (0.55, 0.45)), # 性能监控 -> 概念漂移检测
            ((0.5, 0.35), (0.5, 0.15)),  # 概念漂移检测 -> 模型重训练
            ((0.45, 0.1), (0.45, 0.65))  # 模型重训练 -> 模型预测 (反馈)
        ]
        
        for start, end in arrows:
            ax4.annotate('', xy=end, xytext=start, 
                        xycoords='axes fraction', textcoords='axes fraction',
                        arrowprops=dict(arrowstyle='->', color='black', lw=1.5))
        
        ax4.set_xlim(0, 1)
        ax4.set_ylim(0, 1)
        ax4.axis('off')
        
        plt.tight_layout()
        plt.show()
        
        # 输出分析报告
        print("\n" + "="*60)
        print("实时机器学习分析报告")
        print("="*60)
        
        print(f"\n1. 概念漂移分析:")
        critical_drift = next((item for item in concept_drift_results 
                              if item['performance_drop'] > 0.1), None)
        if critical_drift:
            print(f"   严重漂移阈值: 漂移程度 {critical_drift['drift_level']:.1f}")
            print(f"   对应性能下降: {critical_drift['performance_drop']:.3f}")
        else:
            print(f"   在测试范围内未发现严重概念漂移")
        
        print(f"\n2. 模型稳定性评估:")
        stable_performance = [item for item in concept_drift_results
                              if item['performance_drop'] < 0.05]
        if stable_performance:
            print(f"   稳定性能区间: 漂移程度 0.0 - {max(item['drift_level'] for item in stable_performance):.1f}")
        else:
            print(f"   所有测试漂移程度下的性能下降均超过阈值,建议尽快重训练模型")
        print(f"   建议监控频率: 每小时检查一次性能指标")
        
        print(f"\n3. 自适应学习建议:")
        print(f"   性能监控窗口: 5个批次")
        print(f"   重训练触发阈值: 性能下降 > 3%")
        print(f"   推荐更新策略: 增量学习 + 定期全量重训练")
        
        print(f"\n4. 实施建议:")
        print(f"   - 建立实时性能监控仪表板")
        print(f"   - 设置自动化的概念漂移检测")
        print(f"   - 实现模型版本管理和回滚机制")
        print(f"   - 定期评估和调整检测阈值")

# 演示代码
if __name__ == "__main__":
    # 创建Spark会话
    spark = SparkSession.builder \
        .appName("StreamingMLDemo") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
        .getOrCreate()
    
    try:
        print("=== Spark 实时机器学习演示 ===")
        
        # 创建演示实例
        demo = StreamingMLDemo(spark)
        
        # 1. 训练基础模型
        print("\n1. 训练基础模型...")
        model, assembler = demo.train_base_model()
        
        # 2. 模型监控
        print("\n2. 模型性能监控...")
        monitoring_results = demo.model_monitoring(model, assembler)
        
        # 3. 概念漂移检测
        print("\n3. 概念漂移检测...")
        drift_results = demo.concept_drift_detection(model, assembler)
        
        # 4. 自适应学习
        print("\n4. 自适应学习演示...")
        adapted_model = demo.adaptive_learning()
        
        # 5. 可视化结果
        print("\n5. 可视化分析结果...")
        demo.visualize_streaming_results([], drift_results)
        
        # 6. 流式预测演示(注释掉以避免长时间运行)
        # print("\n6. 启动流式预测...")
        # query = demo.streaming_prediction(model, assembler)
        # query.awaitTermination(timeout=30)  # 运行30秒
        # query.stop()
        
        print("\n=== 实时机器学习演示完成 ===")
        
    except Exception as e:
        print(f"演示过程中出现错误: {str(e)}")
        import traceback
        traceback.print_exc()
    
    finally:
        spark.stop()

实时机器学习总结

核心技术

  1. 流式数据处理

    • Spark Structured Streaming
    • 实时特征工程
    • 增量数据处理
  2. 在线学习

    • 模型增量更新
    • 自适应学习算法
    • 概念漂移处理
  3. 性能监控

    • 实时性能指标计算
    • 异常检测和告警
    • 模型质量评估

挑战与解决方案

概念漂移
  • 检测方法: 统计检验、性能监控、分布比较(基于性能监控的检测示例见下方代码草图)
  • 应对策略: 模型重训练、集成学习、在线更新

实时性要求
  • 延迟优化: 模型简化、特征预计算、缓存策略
  • 吞吐量提升: 并行处理、批处理优化

资源管理
  • 内存优化: 流式窗口管理、数据分区
  • 计算资源: 动态扩缩容、负载均衡
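下面给出一个基于性能监控的概念漂移检测极简草图:利用Structured Streaming的foreachBatch对每个微批计算准确率,相对基准下降超过阈值即打印告警。其中model、streaming_df、label列名以及基准值和阈值均为示例假设,并非前文代码中的固定对象。

from pyspark.sql.functions import col

# 假设:model 为已训练好的模型,streaming_df 为带有真实标签 label 的流式DataFrame
BASELINE_ACC = 0.85      # 示例基准准确率
DRIFT_THRESHOLD = 0.05   # 示例漂移告警阈值

def monitor_batch(batch_df, batch_id):
    """对每个微批打分并计算准确率,下降超过阈值则打印告警(示例逻辑)"""
    if batch_df.count() == 0:
        return
    scored = model.transform(batch_df)
    acc = scored.filter(col("prediction") == col("label")).count() / scored.count()
    print(f"batch {batch_id}: accuracy = {acc:.4f}")
    if BASELINE_ACC - acc > DRIFT_THRESHOLD:
        print(f"batch {batch_id}: 疑似概念漂移,建议触发模型重训练")

# query = streaming_df.writeStream.foreachBatch(monitor_batch).start()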

业务应用场景

  1. 金融风控: 实时欺诈检测、信用评估
  2. 推荐系统: 实时个性化推荐、用户行为分析
  3. IoT监控: 设备异常检测、预测性维护
  4. 广告投放: 实时竞价、效果优化

10. 总结与最佳实践

10.1 Spark MLlib 最佳实践

数据准备

  1. 数据质量检查

    • 缺失值处理
    • 异常值检测
    • 数据类型转换
  2. 特征工程

    • 特征选择和降维
    • 特征缩放和标准化
    • 类别特征编码
  3. 数据分区

    • 合理的分区策略
    • 避免数据倾斜
    • 缓存常用数据集

模型选择

  1. 算法选择原则

    • 根据问题类型选择算法
    • 考虑数据规模和特征维度
    • 平衡性能和可解释性
  2. 超参数调优

    • 使用交叉验证
    • 网格搜索和随机搜索
    • 贝叶斯优化
  3. 模型评估

    • 多种评估指标
    • 交叉验证
    • 业务指标对齐

性能优化

  1. Spark配置优化(配置示例见下文代码草图)

    • 内存分配
    • 并行度设置
    • 序列化优化
  2. 算法优化

    • 选择合适的算法实现
    • 利用Spark的分布式特性
    • 避免数据shuffle
  3. 资源管理

    • 集群资源规划
    • 动态资源分配
    • 监控和调优
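下面是一个示意性的SparkSession配置草图,对应上面提到的内存分配、并行度和序列化优化;参数取值仅为示例,应根据集群资源与数据规模调整。

from pyspark.sql import SparkSession

# 示意性配置:取值仅为示例,需按集群实际情况调整
spark = (SparkSession.builder
         .appName("MLlibTuningSketch")
         .config("spark.executor.memory", "8g")             # 执行器内存
         .config("spark.executor.cores", "4")               # 每个执行器核数
         .config("spark.sql.shuffle.partitions", "200")     # shuffle并行度
         .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")  # Kryo序列化
         .config("spark.sql.adaptive.enabled", "true")      # 自适应查询执行
         .getOrCreate())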

10.2 生产环境部署

模型部署策略

  1. 批处理部署(模型保存与批量打分示例见下文代码草图)

    • 定期模型更新
    • 大规模批量预测
    • 离线特征工程
  2. 实时部署

    • 流式预测服务
    • 低延迟要求
    • 在线特征计算
  3. 混合部署

    • 批流结合
    • 分层架构
    • 灵活切换
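针对批处理部署,下面给出一个保存训练好的Pipeline模型并定期批量打分的极简草图;其中 pipeline_model、spark 以及各路径、列名均为示例假设。

from pyspark.ml import PipelineModel

# 假设 pipeline_model 为训练完成的 PipelineModel,路径与列名仅为示例
pipeline_model.write().overwrite().save("/models/risk_model/v1")

# 定期批量打分:加载固定版本的模型,对离线特征表进行预测并落盘
loaded_model = PipelineModel.load("/models/risk_model/v1")
scores = loaded_model.transform(spark.read.parquet("/data/daily_features"))
scores.select("user_id", "prediction", "probability") \
    .write.mode("overwrite").parquet("/data/daily_scores")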

监控和维护

  1. 性能监控

    • 模型准确率监控
    • 预测延迟监控
    • 资源使用监控
  2. 数据监控

    • 数据质量检查
    • 特征分布监控
    • 概念漂移检测
  3. 模型更新

    • 自动化重训练
    • A/B测试
    • 版本管理

10.3 学习建议

技能发展路径

  1. 基础知识

    • Spark核心概念
    • 机器学习理论
    • 统计学基础
  2. 实践技能

    • 数据处理和分析
    • 模型开发和调优
    • 系统设计和优化
  3. 高级主题

    • 深度学习集成
    • 实时机器学习
    • MLOps实践

持续学习

  1. 跟踪技术发展

    • Spark新版本特性
    • 机器学习算法进展
    • 行业最佳实践
  2. 实践项目

    • 端到端项目经验
    • 不同领域应用
    • 开源贡献
  3. 社区参与

    • 技术会议和研讨会
    • 在线社区讨论
    • 知识分享

通过本教程的学习,你已经掌握了Spark MLlib的核心概念和实践技能。继续深入学习和实践,你将能够在大数据机器学习领域取得更大的成就!

附:决策树模型训练参考代码

以下代码为决策树分类与回归模型的训练、调优与可视化方法。原文中所属类与部分方法定义行在排版中丢失,代码主体按原样保留。

    # 推测为决策树分类模型训练方法的主体(方法签名在原文中缺失,例如 def train_decision_tree_classification(self, df): 仅为假设)
    print("\n1. 特征工程")

    # 字符串索引化
    education_indexer = StringIndexer(
        inputCol="education",
        outputCol="education_index"
    )

    label_indexer = StringIndexer(
        inputCol="performance_level",
        outputCol="label"
    )

    # 组装特征向量
    assembler = VectorAssembler(
        inputCols=["age", "education_index", "experience", "performance_score", "rating"],
        outputCol="features"
    )

    # 应用转换
    df_indexed = education_indexer.fit(df).transform(df)
    df_labeled = label_indexer.fit(df_indexed).transform(df_indexed)
    df_features = assembler.transform(df_labeled)

    print("  处理后的数据:")
    df_features.select(
        "age", "education", "education_index", "experience", 
        "performance_score", "rating", "performance_level", "label", "features"
    ).show(truncate=False)

    # 2. 数据分割
    print("\n2. 数据分割")
    train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)

    print(f"  训练集大小: {train_data.count()}")
    print(f"  测试集大小: {test_data.count()}")

    # 3. 创建决策树模型
    print("\n3. 创建决策树模型")

    dt = DecisionTreeClassifier(
        featuresCol="features",
        labelCol="label",
        maxDepth=5,
        minInstancesPerNode=1,
        minInfoGain=0.0,
        impurity="gini"
    )

    # 4. 训练模型
    print("\n4. 训练模型")
    dt_model = dt.fit(train_data)

    print("  模型训练完成")
    print(f"  树的深度: {dt_model.depth}")
    print(f"  节点数量: {dt_model.numNodes}")

    # 5. 特征重要性
    print("\n5. 特征重要性")
    feature_names = ["age", "education_index", "experience", "performance_score", "rating"]
    importances = dt_model.featureImportances.toArray()

    print("  特征重要性排序:")
    for name, importance in zip(feature_names, importances):
        print(f"    {name}: {importance:.4f}")

    # 6. 模型预测
    print("\n6. 模型预测")
    predictions = dt_model.transform(test_data)

    print("  预测结果:")
    predictions.select(
        "age", "education", "experience", "performance_score", 
        "performance_level", "label", "prediction"
    ).show()

    # 7. 模型评估
    print("\n7. 模型评估")

    evaluator = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction"
    )

    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
    recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
    f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

    print(f"  准确率: {accuracy:.4f}")
    print(f"  加权精确率: {precision:.4f}")
    print(f"  加权召回率: {recall:.4f}")
    print(f"  加权F1分数: {f1:.4f}")

    # 8. 混淆矩阵
    print("\n8. 混淆矩阵")
    confusion_matrix = predictions.groupBy("label", "prediction").count().orderBy("label", "prediction")
    confusion_matrix.show()

    return {
        'model': dt_model,
        'predictions': predictions,
        'metrics': {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        },
        'feature_importances': dict(zip(feature_names, importances))
    }

def hyperparameter_tuning(self, df):
    """
    决策树超参数调优
    """
    print("\n=== 决策树超参数调优 ===")

    from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
    from pyspark.ml import Pipeline

    # 1. 创建Pipeline
    education_indexer = StringIndexer(inputCol="education", outputCol="education_index")
    label_indexer = StringIndexer(inputCol="performance_level", outputCol="label")
    assembler = VectorAssembler(
        inputCols=["age", "education_index", "experience", "performance_score", "rating"],
        outputCol="features"
    )
    dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")

    pipeline = Pipeline(stages=[education_indexer, label_indexer, assembler, dt])

    # 2. 参数网格
    param_grid = ParamGridBuilder() \
        .addGrid(dt.maxDepth, [3, 5, 7]) \
        .addGrid(dt.minInstancesPerNode, [1, 5, 10]) \
        .addGrid(dt.impurity, ["gini", "entropy"]) \
        .build()

    print(f"  参数组合数量: {len(param_grid)}")

    # 3. 交叉验证
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="accuracy"
    )

    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=3,
        seed=42
    )

    # 4. 训练和选择最佳模型
    print("\n  开始交叉验证...")
    cv_model = cv.fit(df)

    # 5. 最佳参数
    best_pipeline = cv_model.bestModel
    best_dt = best_pipeline.stages[-1]  # 决策树是Pipeline的最后一个阶段

    print("\n  最佳参数:")
    print(f"    maxDepth: {best_dt.getMaxDepth()}")
    print(f"    minInstancesPerNode: {best_dt.getMinInstancesPerNode()}")
    print(f"    impurity: {best_dt.getImpurity()}")

    # 6. 交叉验证结果
    print("\n  交叉验证平均分数:")
    avg_metrics = cv_model.avgMetrics
    for i, score in enumerate(avg_metrics):
        print(f"    参数组合 {i+1}: {score:.4f}")

    best_score = max(avg_metrics)
    print(f"\n  最佳交叉验证分数: {best_score:.4f}")

    return {
        'best_model': best_pipeline,
        'cv_model': cv_model,
        'best_score': best_score,
        'avg_metrics': avg_metrics
    }

def train_decision_tree_regression(self, df):
    """
    训练决策树回归模型
    """
    print("\n=== 决策树回归模型训练 ===")

    # 1. 特征工程
    print("\n1. 特征工程")

    feature_cols = [
        "price", "advertising_budget", "season", "category", "competitors",
        "market_maturity", "brand_awareness", "quality_score",
        "promotion", "online_channel"
    ]

    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features"
    )

    df_features = assembler.transform(df)

    print("  特征工程完成")
    print(f"  特征维度: {len(feature_cols)}")

    # 2. 数据分割
    print("\n2. 数据分割")
    train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)

    print(f"  训练集大小: {train_data.count()}")
    print(f"  测试集大小: {test_data.count()}")

    # 3. 创建决策树回归模型
    print("\n3. 创建决策树回归模型")

    dt_regressor = DecisionTreeRegressor(
        featuresCol="features",
        labelCol="sales",
        maxDepth=10,
        minInstancesPerNode=20,
        minInfoGain=0.01,
        seed=42
    )

    # 4. 训练模型
    print("\n4. 训练模型")
    dt_model = dt_regressor.fit(train_data)

    print("  模型训练完成")
    print(f"  树深度: {dt_model.depth}")
    print(f"  节点数量: {dt_model.numNodes}")

    # 5. 特征重要性
    print("\n5. 特征重要性")
    feature_importance = dt_model.featureImportances.toArray()

    print("  特征重要性排序:")
    importance_pairs = list(zip(feature_cols, feature_importance))
    importance_pairs.sort(key=lambda x: x[1], reverse=True)

    for feature, importance in importance_pairs:
        print(f"    {feature}: {importance:.4f}")

    # 6. 模型预测
    print("\n6. 模型预测")
    predictions = dt_model.transform(test_data)

    print("  预测结果样例:")
    predictions.select(
        "product_id", "price", "advertising_budget", "season",
        "sales", "prediction"
    ).show(10)

    # 7. 模型评估
    print("\n7. 模型评估")

    evaluator = RegressionEvaluator(
        labelCol="sales",
        predictionCol="prediction"
    )

    rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
    mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

    print(f"  RMSE (均方根误差): {rmse:.4f}")
    print(f"  MAE (平均绝对误差): {mae:.4f}")
    print(f"  R² (决定系数): {r2:.4f}")

    # 8. 训练集评估
    train_predictions = dt_model.transform(train_data)
    train_rmse = evaluator.evaluate(train_predictions, {evaluator.metricName: "rmse"})
    train_r2 = evaluator.evaluate(train_predictions, {evaluator.metricName: "r2"})

    print(f"\n  训练集性能:")
    print(f"    RMSE: {train_rmse:.4f}")
    print(f"    R²: {train_r2:.4f}")

    # 9. 过拟合分析
    print("\n8. 过拟合分析")
    overfitting_rmse = abs(train_rmse - rmse)
    overfitting_r2 = abs(train_r2 - r2)

    print(f"  RMSE差异: {overfitting_rmse:.4f}")
    print(f"  R²差异: {overfitting_r2:.4f}")

    if overfitting_rmse > rmse * 0.1:
        print("  ⚠️ 检测到过拟合,建议减少树深度或增加最小样本数")
    else:
        print("  ✅ 模型泛化性能良好")

    return {
        'model': dt_model,
        'predictions': predictions,
        'feature_cols': feature_cols,
        'metrics': {
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'train_rmse': train_rmse,
            'train_r2': train_r2
        },
        'feature_importance': dict(zip(feature_cols, feature_importance)),
        'tree_info': {
            'depth': dt_model.depth,
            'num_nodes': dt_model.numNodes
        }
    }

def hyperparameter_tuning_regression(self, df):
    """
    决策树回归超参数调优
    """
    print("\n=== 决策树回归超参数调优 ===")

    # 特征工程
    feature_cols = [
        "price", "advertising_budget", "season", "category", "competitors",
        "market_maturity", "brand_awareness", "quality_score",
        "promotion", "online_channel"
    ]

    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

    # 创建Pipeline
    dt_regressor = DecisionTreeRegressor(
        featuresCol="features",
        labelCol="sales",
        seed=42
    )

    pipeline = Pipeline(stages=[assembler, dt_regressor])

    # 参数网格
    param_grid = ParamGridBuilder() \
        .addGrid(dt_regressor.maxDepth, [5, 10, 15, 20]) \
        .addGrid(dt_regressor.minInstancesPerNode, [10, 20, 50]) \
        .addGrid(dt_regressor.minInfoGain, [0.0, 0.01, 0.05]) \
        .build()

    print(f"  参数组合数量: {len(param_grid)}")

    # 交叉验证
    evaluator = RegressionEvaluator(
        labelCol="sales",
        predictionCol="prediction",
        metricName="rmse"
    )

    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=5,
        seed=42
    )

    # 训练和评估
    print("\n  开始交叉验证...")
    cv_model = cv.fit(df)

    # 最佳模型
    best_model = cv_model.bestModel
    best_dt_model = best_model.stages[1]

    print("\n  最佳参数:")
    print(f"    最大深度: {best_dt_model.getMaxDepth()}")
    print(f"    最小实例数: {best_dt_model.getMinInstancesPerNode()}")
    print(f"    最小信息增益: {best_dt_model.getMinInfoGain()}")

    # 评估最佳模型
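    # 注意:此处直接在调参所用的全量数据 df 上评估,结果会偏乐观;更严谨的做法是预留独立的测试集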
    test_data = df
    predictions = cv_model.transform(test_data)

    rmse = evaluator.evaluate(predictions)

    evaluator_r2 = RegressionEvaluator(
        labelCol="sales",
        predictionCol="prediction",
        metricName="r2"
    )
    r2 = evaluator_r2.evaluate(predictions)

    print(f"\n  最佳模型性能:")
    print(f"    RMSE: {rmse:.4f}")
    print(f"    R²: {r2:.4f}")
    print(f"    树深度: {best_dt_model.depth}")
    print(f"    节点数量: {best_dt_model.numNodes}")

    return {
        'best_model': cv_model,
        'best_params': {
            'maxDepth': best_dt_model.getMaxDepth(),
            'minInstancesPerNode': best_dt_model.getMinInstancesPerNode(),
            'minInfoGain': best_dt_model.getMinInfoGain()
        },
        'best_metrics': {
            'rmse': rmse,
            'r2': r2
        },
        'cv_results': cv_model.avgMetrics
    }

def compare_tree_depths_regression(self, df):
    """
    比较不同树深度的性能和过拟合情况
    """
    print("\n=== 比较不同树深度 ===")

    # 特征工程
    feature_cols = [
        "price", "advertising_budget", "season", "category", "competitors",
        "market_maturity", "brand_awareness", "quality_score",
        "promotion", "online_channel"
    ]

    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    df_features = assembler.transform(df)

    train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)

    # 不同的树深度
    max_depths = [3, 5, 7, 10, 15, 20, 25]
    results = []

    evaluator = RegressionEvaluator(
        labelCol="sales",
        predictionCol="prediction"
    )

    for depth in max_depths:
        print(f"\n  测试树深度: {depth}")

        # 创建决策树
        dt = DecisionTreeRegressor(
            featuresCol="features",
            labelCol="sales",
            maxDepth=depth,
            minInstancesPerNode=20,
            seed=42
        )

        # 训练模型
        model = dt.fit(train_data)

        # 预测
        train_pred = model.transform(train_data)
        test_pred = model.transform(test_data)

        # 评估
        train_rmse = evaluator.evaluate(train_pred, {evaluator.metricName: "rmse"})
        test_rmse = evaluator.evaluate(test_pred, {evaluator.metricName: "rmse"})
        train_r2 = evaluator.evaluate(train_pred, {evaluator.metricName: "r2"})
        test_r2 = evaluator.evaluate(test_pred, {evaluator.metricName: "r2"})

        result = {
            'max_depth': depth,
            'train_rmse': train_rmse,
            'test_rmse': test_rmse,
            'train_r2': train_r2,
            'test_r2': test_r2,
            'overfitting_rmse': abs(train_rmse - test_rmse),
            'overfitting_r2': abs(train_r2 - test_r2),
            'num_nodes': model.numNodes
        }

        results.append(result)

        print(f"    训练RMSE: {train_rmse:.4f}, 测试RMSE: {test_rmse:.4f}")
        print(f"    训练R²: {train_r2:.4f}, 测试R²: {test_r2:.4f}")
        print(f"    过拟合程度(RMSE): {abs(train_rmse - test_rmse):.4f}")
        print(f"    节点数量: {model.numNodes}")

    return results

def visualize_dt_regression_results(self, model_results, comparison_results=None, tuning_results=None):
    """
    可视化决策树回归结果
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np
    import pandas as pd

    # 设置中文字体
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # 创建图形
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('决策树回归模型分析结果', fontsize=16, fontweight='bold')

    predictions = model_results['predictions']

    # 1. 预测值 vs 实际值
    pred_actual = predictions.select("prediction", "sales").toPandas()
    axes[0, 0].scatter(pred_actual['sales'], pred_actual['prediction'], alpha=0.6)
    axes[0, 0].plot([pred_actual['sales'].min(), pred_actual['sales'].max()], 
                   [pred_actual['sales'].min(), pred_actual['sales'].max()], 'r--', lw=2)
    axes[0, 0].set_xlabel('实际销售额')
    axes[0, 0].set_ylabel('预测销售额')
    axes[0, 0].set_title('预测值 vs 实际值')
    axes[0, 0].grid(True, alpha=0.3)

    # 2. 残差分析
    residuals = pred_actual['sales'] - pred_actual['prediction']
    axes[0, 1].scatter(pred_actual['prediction'], residuals, alpha=0.6)
    axes[0, 1].axhline(y=0, color='r', linestyle='--')
    axes[0, 1].set_xlabel('预测销售额')
    axes[0, 1].set_ylabel('残差')
    axes[0, 1].set_title('残差分析')
    axes[0, 1].grid(True, alpha=0.3)

    # 3. 特征重要性
    feature_names = list(model_results['feature_importance'].keys())
    importance_values = list(model_results['feature_importance'].values())

    # 按重要性排序
    sorted_idx = np.argsort(importance_values)[::-1]
    sorted_features = [feature_names[i] for i in sorted_idx]
    sorted_importance = [importance_values[i] for i in sorted_idx]

    bars = axes[0, 2].bar(range(len(sorted_features)), sorted_importance, alpha=0.7)
    axes[0, 2].set_xticks(range(len(sorted_features)))
    axes[0, 2].set_xticklabels(sorted_features, rotation=45)
    axes[0, 2].set_ylabel('重要性')
    axes[0, 2].set_title('特征重要性')
    axes[0, 2].grid(True, alpha=0.3)

    # 4. 树深度比较(如果有比较结果)
    if comparison_results:
        depths = [r['max_depth'] for r in comparison_results]
        train_rmse = [r['train_rmse'] for r in comparison_results]
        test_rmse = [r['test_rmse'] for r in comparison_results]

        axes[1, 0].plot(depths, train_rmse, 'bo-', label='训练RMSE', linewidth=2)
        axes[1, 0].plot(depths, test_rmse, 'ro-', label='测试RMSE', linewidth=2)
        axes[1, 0].set_xlabel('树深度')
        axes[1, 0].set_ylabel('RMSE')
        axes[1, 0].set_title('树深度 vs RMSE')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)

    # 5. 过拟合分析
    if comparison_results:
        depths = [r['max_depth'] for r in comparison_results]
        overfitting = [r['overfitting_rmse'] for r in comparison_results]

        axes[1, 1].plot(depths, overfitting, 'go-', linewidth=2, markersize=6)
        axes[1, 1].set_xlabel('树深度')
        axes[1, 1].set_ylabel('过拟合程度 (RMSE差异)')
        axes[1, 1].set_title('过拟合分析')
        axes[1, 1].grid(True, alpha=0.3)

    # 6. 模型性能指标
    metrics = model_results['metrics']
    metric_names = ['RMSE', 'MAE', 'R²']
    metric_values = [metrics['rmse'], metrics['mae'], metrics['r2']]
    colors = ['red', 'orange', 'green']

    bars = axes[1, 2].bar(metric_names, metric_values, color=colors, alpha=0.7)
    axes[1, 2].set_ylabel('指标值')
    axes[1, 2].set_title('模型性能指标')
    axes[1, 2].grid(True, alpha=0.3)

    # 添加数值标签
    for bar, value in zip(bars, metric_values):
        height = bar.get_height()
        axes[1, 2].text(bar.get_x() + bar.get_width()/2., height,
                       f'{value:.4f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    # 输出详细分析
    print("\n=== 决策树回归模型分析报告 ===")
    print(f"模型性能指标:")
    print(f"  - RMSE: {metrics['rmse']:.4f}")
    print(f"  - MAE: {metrics['mae']:.4f}")
    print(f"  - R²: {metrics['r2']:.4f}")

    print(f"\n树结构信息:")
    print(f"  - 树深度: {model_results['tree_info']['depth']}")
    print(f"  - 节点数量: {model_results['tree_info']['num_nodes']}")

    print(f"\n过拟合分析:")
    print(f"  - 训练集R²: {metrics['train_r2']:.4f}")
    print(f"  - 测试集R²: {metrics['r2']:.4f}")
    print(f"  - 过拟合程度: {abs(metrics['train_r2'] - metrics['r2']):.4f}")

    # 特征重要性分析
    print(f"\n特征重要性分析:")
    top_features = sorted(model_results['feature_importance'].items(), 
                        key=lambda x: x[1], reverse=True)[:5]
    for feature, importance in top_features:
        print(f"  - {feature}: {importance:.4f}")

    if comparison_results:
        best_depth_result = min(comparison_results, key=lambda x: x['test_rmse'])
        print(f"\n最佳树深度分析:")
        print(f"  - 最佳深度: {best_depth_result['max_depth']}")
        print(f"  - 最佳测试RMSE: {best_depth_result['test_rmse']:.4f}")
        print(f"  - 对应节点数: {best_depth_result['num_nodes']}")

    return {
        'residuals_mean': residuals.mean(),
        'residuals_std': residuals.std(),
        'overfitting_degree': abs(metrics['train_r2'] - metrics['r2'])
    }

# 演示代码
if __name__ == "__main__":
    # 创建演示实例
    demo = DecisionTreeRegressionDemo()

    # 1. 创建数据集
    print("=== 创建销售预测数据集 ===")
    df = demo.create_dataset(n_samples=2000)

    # 2. 训练决策树回归模型
    model_results = demo.train_decision_tree_regression(df)

    # 3. 超参数调优
    tuning_results = demo.hyperparameter_tuning_regression(df)

    # 4. 比较不同树深度
    comparison_results = demo.compare_tree_depths_regression(df)

    # 5. 可视化结果
    demo.visualize_dt_regression_results(model_results, comparison_results, tuning_results)

    print("\n=== 决策树回归演示完成 ===")

**总结:**

决策树回归具有以下特点:

1. **优点**:
   - 能够捕捉非线性关系
   - 不需要特征标准化
   - 可解释性强
   - 能够处理数值和分类特征
   - 自动进行特征选择

2. **缺点**:
   - 容易过拟合
   - 对噪声敏感
   - 预测结果不连续
   - 单棵树的泛化能力有限

3. **适用场景**:
   - 数据存在非线性关系
   - 需要模型可解释性
   - 特征类型混合
   - 数据量适中

4. **业务建议**:
   - 销售预测中,广告预算、价格、季节性是关键因素
   - 建议控制树深度防止过拟合
   - 可结合随机森林提升性能

## 5. 聚类算法

聚类是无监督学习的重要分支,用于发现数据中的隐藏模式和结构。

### 5.1 K-Means聚类

K-Means是最常用的聚类算法,通过迭代优化将数据分为K个簇。
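
K-Means 以最小化簇内平方误差为优化目标:

$$
\min_{C_1,\dots,C_K}\ \sum_{k=1}^{K}\sum_{x \in C_k}\lVert x - \mu_k\rVert^2
$$

其中 $\mu_k$ 为第 $k$ 个簇的中心。在进入下面完整的客户细分案例之前,先给出一个最小可运行的示意,演示如何用 ClusteringEvaluator(轮廓系数)辅助选择 K 值;其中的数据与列名仅为示例假设:

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

spark = SparkSession.builder.appName("KMeansSilhouetteSketch").getOrCreate()

# 构造一个二维小数据集(仅为演示假设)
df = spark.createDataFrame(
    [(1.0, 2.0), (1.5, 1.8), (0.8, 1.2),
     (8.0, 8.0), (8.5, 9.0), (9.0, 8.5)],
    ["x1", "x2"]
)
df_features = VectorAssembler(inputCols=["x1", "x2"], outputCol="features").transform(df)

# 尝试不同的K值,用轮廓系数(越接近1越好)辅助选择
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="cluster",
                                metricName="silhouette")
for k in [2, 3]:
    model = KMeans(featuresCol="features", predictionCol="cluster", k=k, seed=42).fit(df_features)
    score = evaluator.evaluate(model.transform(df_features))
    print(f"k={k}, 轮廓系数={score:.4f}")
```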

```python
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

class KMeansDemo:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("KMeansDemo") \
            .getOrCreate()
    
    def create_dataset(self, n_samples=3000):
        """
        创建客户细分数据集
        """
        print(f"\n创建客户细分数据集 (样本数: {n_samples})")
        
        np.random.seed(42)
        
        # 生成客户ID
        customer_ids = [f"CUST_{i:05d}" for i in range(1, n_samples + 1)]
        
        # 客户特征
        ages = np.random.normal(40, 15, n_samples)
        ages = np.clip(ages, 18, 80).astype(int)
        
        # 年收入(万元)
        incomes = np.random.lognormal(3.5, 0.6, n_samples)
        incomes = np.clip(incomes, 20, 200)
        
        # 消费金额(年度,万元)
        spending_scores = np.random.normal(50, 25, n_samples)
        spending_scores = np.clip(spending_scores, 1, 100)
        
        # 购买频次(年度)
        purchase_frequency = np.random.poisson(12, n_samples)
        purchase_frequency = np.clip(purchase_frequency, 1, 50)
        
        # 客户忠诚度评分(1-10)
        loyalty_scores = np.random.normal(6, 2, n_samples)
        loyalty_scores = np.clip(loyalty_scores, 1, 10)
        
        # 在线购买比例(0-1)
        online_ratios = np.random.beta(2, 2, n_samples)
        
        # 客户生命周期(月)
        customer_lifetime = np.random.exponential(24, n_samples)
        customer_lifetime = np.clip(customer_lifetime, 1, 120)
        
        # 平均订单价值
        avg_order_values = spending_scores * 10000 / purchase_frequency
        avg_order_values = np.clip(avg_order_values, 50, 5000)
        
        # 退货率
        return_rates = np.random.beta(1, 9, n_samples)  # 大多数客户退货率较低
        
        # 推荐接受率
        recommendation_acceptance = np.random.beta(3, 7, n_samples)
        
        # 创建DataFrame
        data = [
            (
                customer_ids[i], int(ages[i]), float(incomes[i]), float(spending_scores[i]),
                int(purchase_frequency[i]), float(loyalty_scores[i]), float(online_ratios[i]),
                float(customer_lifetime[i]), float(avg_order_values[i]), float(return_rates[i]),
                float(recommendation_acceptance[i])
            )
            for i in range(n_samples)
        ]
        
        schema = StructType([
            StructField("customer_id", StringType(), True),
            StructField("age", IntegerType(), True),
            StructField("income", DoubleType(), True),
            StructField("spending_score", DoubleType(), True),
            StructField("purchase_frequency", IntegerType(), True),
            StructField("loyalty_score", DoubleType(), True),
            StructField("online_ratio", DoubleType(), True),
            StructField("customer_lifetime", DoubleType(), True),
            StructField("avg_order_value", DoubleType(), True),
            StructField("return_rate", DoubleType(), True),
            StructField("recommendation_acceptance", DoubleType(), True)
        ])
        
        self.df = self.spark.createDataFrame(data, schema)
        
        print(f"  生成数据集大小: {self.df.count()} 行, {len(self.df.columns)} 列")
        
        # 显示数据概览
        print("\n  数据概览:")
        self.df.show(10)
        
        # 显示数据统计
        print("\n  数据统计:")
        self.df.describe().show()
        
        return self.df
    
    def visualize_results(self, model_results, tuning_results=None):
        """可视化线性回归结果"""
        import matplotlib.pyplot as plt
        import seaborn as sns
        import numpy as np
        import pandas as pd
        
        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('线性回归模型分析结果', fontsize=16, fontweight='bold')
        
        predictions = model_results['predictions']
        
        # 1. 预测值 vs 实际值
        pred_actual = predictions.select("prediction", "price").toPandas()
        axes[0, 0].scatter(pred_actual['price'], pred_actual['prediction'], alpha=0.6)
        axes[0, 0].plot([pred_actual['price'].min(), pred_actual['price'].max()], 
                       [pred_actual['price'].min(), pred_actual['price'].max()], 'r--', lw=2)
        axes[0, 0].set_xlabel('实际价格')
        axes[0, 0].set_ylabel('预测价格')
        axes[0, 0].set_title('预测值 vs 实际值')
        axes[0, 0].grid(True, alpha=0.3)
        
        # 2. 残差分析
        residuals = pred_actual['price'] - pred_actual['prediction']
        axes[0, 1].scatter(pred_actual['prediction'], residuals, alpha=0.6)
        axes[0, 1].axhline(y=0, color='r', linestyle='--')
        axes[0, 1].set_xlabel('预测价格')
        axes[0, 1].set_ylabel('残差')
        axes[0, 1].set_title('残差分析')
        axes[0, 1].grid(True, alpha=0.3)
        
        # 3. 残差分布
        axes[0, 2].hist(residuals, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
        axes[0, 2].axvline(residuals.mean(), color='red', linestyle='--', 
                          label=f'均值: {residuals.mean():.2f}')
        axes[0, 2].set_xlabel('残差')
        axes[0, 2].set_ylabel('频次')
        axes[0, 2].set_title('残差分布')
        axes[0, 2].legend()
        axes[0, 2].grid(True, alpha=0.3)
        
        # 4. 特征重要性(系数)
        feature_names = list(model_results['coefficients'].keys())
        coefficients = list(model_results['coefficients'].values())
        
        # 按绝对值排序
        sorted_idx = np.argsort(np.abs(coefficients))[::-1]
        sorted_features = [feature_names[i] for i in sorted_idx]
        sorted_coefs = [coefficients[i] for i in sorted_idx]
        
        colors = ['red' if x < 0 else 'blue' for x in sorted_coefs]
        bars = axes[1, 0].bar(range(len(sorted_features)), sorted_coefs, color=colors, alpha=0.7)
        axes[1, 0].set_xticks(range(len(sorted_features)))
        axes[1, 0].set_xticklabels(sorted_features, rotation=45)
        axes[1, 0].set_ylabel('系数值')
        axes[1, 0].set_title('特征系数(重要性)')
        axes[1, 0].axhline(y=0, color='black', linestyle='-', alpha=0.3)
        axes[1, 0].grid(True, alpha=0.3)
        
        # 添加数值标签
        for bar, coef in zip(bars, sorted_coefs):
            height = bar.get_height()
            axes[1, 0].text(bar.get_x() + bar.get_width()/2., height,
                           f'{coef:.1f}', ha='center', va='bottom' if height > 0 else 'top')
        
        # 5. 超参数调优结果
        if tuning_results:
            avg_metrics = tuning_results['avg_metrics']
            param_indices = list(range(1, len(avg_metrics) + 1))
            
            axes[1, 1].plot(param_indices, avg_metrics, 'bo-', linewidth=2, markersize=6)
            axes[1, 1].axhline(y=min(avg_metrics), color='red', linestyle='--', 
                              label=f'最佳RMSE: {min(avg_metrics):.2f}')
            axes[1, 1].set_xlabel('参数组合')
            axes[1, 1].set_ylabel('交叉验证RMSE')
            axes[1, 1].set_title('超参数调优结果')
            axes[1, 1].legend()
            axes[1, 1].grid(True, alpha=0.3)
        
        # 6. 模型性能指标
        metrics = model_results['metrics']
        metric_names = ['RMSE', 'MAE', 'R²']
        metric_values = [metrics['rmse'], metrics['mae'], metrics['r2']]
        colors = ['red', 'orange', 'green']
        
        bars = axes[1, 2].bar(metric_names, metric_values, color=colors, alpha=0.7)
        axes[1, 2].set_ylabel('指标值')
        axes[1, 2].set_title('模型性能指标')
        axes[1, 2].grid(True, alpha=0.3)
        
        # 添加数值标签
        for bar, value in zip(bars, metric_values):
            height = bar.get_height()
            axes[1, 2].text(bar.get_x() + bar.get_width()/2., height,
                           f'{value:.4f}', ha='center', va='bottom')
        
        plt.tight_layout()
        plt.show()
        
        # 输出详细分析
        print("\n=== 线性回归模型分析报告 ===")
        print(f"模型性能指标:")
        print(f"  - RMSE: {metrics['rmse']:.2f}")
        print(f"  - MAE: {metrics['mae']:.2f}")
        print(f"  - R²: {metrics['r2']:.4f}")
        
        print(f"\n残差分析:")
        print(f"  - 残差均值: {residuals.mean():.2f}")
        print(f"  - 残差标准差: {residuals.std():.2f}")
        print(f"  - 残差范围: [{residuals.min():.2f}, {residuals.max():.2f}]")
        
        print(f"\n特征重要性分析:")
        for name, coef in zip(sorted_features[:5], sorted_coefs[:5]):
            print(f"  - {name}: {coef:.2f}")
        
        return {
            'residuals_mean': residuals.mean(),
            'residuals_std': residuals.std(),
            'top_features': list(zip(sorted_features[:5], sorted_coefs[:5]))
        }

# 演示代码
if __name__ == "__main__":
    # 创建演示实例
    demo = LinearRegressionDemo()
    
    # 1. 创建数据集
    print("=== 创建房价预测数据集 ===")
    df = demo.create_dataset(n_samples=2000)
    
    # 2. 训练线性回归模型
    model_results = demo.train_linear_regression(df)
    
    # 3. 超参数调优
    tuning_results = demo.hyperparameter_tuning(df)
    
    # 4. 可视化结果
    demo.visualize_results(model_results, tuning_results)
    
    print("\n=== 线性回归演示完成 ===")

**总结:**

线性回归是最基础但非常重要的回归算法,具有以下特点:

1. **优点**:
   - 模型简单,易于理解和解释
   - 训练速度快,计算效率高
   - 不容易过拟合
   - 特征系数直接反映特征重要性

2. **缺点**:
   - 假设特征与目标变量线性相关
   - 对异常值敏感
   - 无法捕捉非线性关系

3. **适用场景**:
   - 特征与目标变量线性相关
   - 需要模型可解释性
   - 数据量较大,需要快速训练
   - 作为基线模型进行比较

4. **业务建议**:
   - 房价预测中,面积、地理位置、学区是最重要的因素
   - 建议重点关注这些高影响因素
   - 可以考虑特征工程来提升模型性能

### 4.2 岭回归(Ridge Regression)

岭回归是线性回归的正则化版本,通过添加L2正则化项来防止过拟合。
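
为便于理解下面代码中 `regParam` 与 `elasticNetParam` 两个参数的含义,这里给出 Spark ML 线性回归带弹性网正则化的目标函数的大致形式(细节如特征标准化方式以官方文档为准):

$$
\min_{w,\,b}\ \frac{1}{2n}\sum_{i=1}^{n}\bigl(w^{\top}x_i + b - y_i\bigr)^2
\;+\;\lambda\Bigl(\alpha\lVert w\rVert_1 + \frac{1-\alpha}{2}\lVert w\rVert_2^2\Bigr)
$$

其中 $\lambda$ 对应 `regParam`,$\alpha$ 对应 `elasticNetParam`。当 $\alpha=0$ 时退化为岭回归(纯L2正则化),当 $\alpha=1$ 时为Lasso(纯L1正则化)。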

```python
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

class RidgeRegressionDemo:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("RidgeRegressionDemo") \
            .getOrCreate()
    
    def create_dataset(self, n_samples=1500):
        """
        创建股票价格预测数据集
        """
        print(f"\n创建股票价格预测数据集 (样本数: {n_samples})")
        
        np.random.seed(42)
        
        # 生成股票ID
        stock_ids = [f"STOCK_{i:04d}" for i in range(1, n_samples + 1)]
        
        # 基础财务指标
        pe_ratios = np.random.lognormal(2.5, 0.8, n_samples)  # 市盈率
        pb_ratios = np.random.lognormal(0.8, 0.6, n_samples)  # 市净率
        roe = np.random.normal(12, 8, n_samples)  # 净资产收益率
        roe = np.clip(roe, -20, 50)
        
        # 市场指标
        market_caps = np.random.lognormal(8, 1.5, n_samples)  # 市值(亿元)
        volumes = np.random.lognormal(15, 1.2, n_samples)  # 成交量
        volatilities = np.random.gamma(2, 0.1, n_samples)  # 波动率
        
        # 技术指标
        rsi = np.random.normal(50, 20, n_samples)  # RSI指标
        rsi = np.clip(rsi, 0, 100)
        
        macd = np.random.normal(0, 2, n_samples)  # MACD指标
        
        # 宏观经济指标
        gdp_growth = np.random.normal(6.5, 1.5, n_samples)  # GDP增长率
        inflation = np.random.normal(2.5, 1, n_samples)  # 通胀率
        interest_rate = np.random.normal(4, 1, n_samples)  # 利率
        
        # 行业分类(1-10个行业)
        industries = np.random.randint(1, 11, n_samples)
        
        # 生成股票价格(基于多个因素的复杂关系)
        base_price = (
            np.log(market_caps) * 5 +  # 市值影响
            roe * 0.8 +  # ROE正向影响
            gdp_growth * 2 +  # GDP增长正向影响
            -pe_ratios * 0.3 +  # 高PE负向影响
            -pb_ratios * 0.5 +  # 高PB负向影响
            -volatilities * 10 +  # 高波动负向影响
            -inflation * 1.5 +  # 通胀负向影响
            (rsi - 50) * 0.1 +  # RSI偏离50的影响
            macd * 0.5  # MACD影响
        )
        
        # 添加行业效应
        industry_effects = np.random.normal(0, 3, 10)
        for i in range(n_samples):
            base_price[i] += industry_effects[industries[i] - 1]
        
        # 添加随机噪声和非线性效应
        noise = np.random.normal(0, 5, n_samples)
        nonlinear_effect = np.sin(pe_ratios / 10) * 2  # 非线性效应
        
        prices = base_price + noise + nonlinear_effect
        prices = np.maximum(prices, 5)  # 最低价格5元
        
        # 创建DataFrame
        data = [
            (
                stock_ids[i], float(pe_ratios[i]), float(pb_ratios[i]), float(roe[i]),
                float(market_caps[i]), float(volumes[i]), float(volatilities[i]),
                float(rsi[i]), float(macd[i]), float(gdp_growth[i]),
                float(inflation[i]), float(interest_rate[i]), int(industries[i]), float(prices[i])
            )
            for i in range(n_samples)
        ]
        
        schema = StructType([
            StructField("stock_id", StringType(), True),
            StructField("pe_ratio", DoubleType(), True),
            StructField("pb_ratio", DoubleType(), True),
            StructField("roe", DoubleType(), True),
            StructField("market_cap", DoubleType(), True),
            StructField("volume", DoubleType(), True),
            StructField("volatility", DoubleType(), True),
            StructField("rsi", DoubleType(), True),
            StructField("macd", DoubleType(), True),
            StructField("gdp_growth", DoubleType(), True),
            StructField("inflation", DoubleType(), True),
            StructField("interest_rate", DoubleType(), True),
            StructField("industry", IntegerType(), True),
            StructField("price", DoubleType(), True)
        ])
        
        self.df = self.spark.createDataFrame(data, schema)
        
        print(f"  生成数据集大小: {self.df.count()} 行, {len(self.df.columns)} 列")
        
        # 显示数据概览
        print("\n  数据概览:")
        self.df.show(10)
        
        # 显示数据统计
        print("\n  数据统计:")
        self.df.describe().show()
        
        return self.df
    
    def train_ridge_regression(self, df):
        """
        训练岭回归模型
        """
        print("\n=== 岭回归模型训练 ===")
        
        # 1. 特征工程
        print("\n1. 特征工程")
        
        feature_cols = [
            "pe_ratio", "pb_ratio", "roe", "market_cap", "volume",
            "volatility", "rsi", "macd", "gdp_growth", "inflation",
            "interest_rate", "industry"
        ]
        
        assembler = VectorAssembler(
            inputCols=feature_cols,
            outputCol="raw_features"
        )
        
        # 特征标准化(对岭回归很重要)
        scaler = StandardScaler(
            inputCol="raw_features",
            outputCol="features",
            withStd=True,
            withMean=True
        )
        
        # 应用转换
        df_assembled = assembler.transform(df)
        scaler_model = scaler.fit(df_assembled)
        df_features = scaler_model.transform(df_assembled)
        
        print("  特征工程完成")
        print(f"  特征维度: {len(feature_cols)}")
        
        # 2. 数据分割
        print("\n2. 数据分割")
        train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)
        
        print(f"  训练集大小: {train_data.count()}")
        print(f"  测试集大小: {test_data.count()}")
        
        # 3. 创建岭回归模型(使用L2正则化)
        print("\n3. 创建岭回归模型")
        
        ridge = LinearRegression(
            featuresCol="features",
            labelCol="price",
            regParam=1.0,  # L2正则化参数(较大值)
            elasticNetParam=0.0,  # 纯L2正则化
            maxIter=100,
            tol=1e-6
        )
        
        # 4. 训练模型
        print("\n4. 训练模型")
        ridge_model = ridge.fit(train_data)
        
        print("  模型训练完成")
        print(f"  迭代次数: {ridge_model.summary.totalIterations}")
        print(f"  目标函数值: {ridge_model.summary.objectiveHistory[-1]:.6f}")
        
        # 5. 模型系数
        print("\n5. 模型系数")
        coefficients = ridge_model.coefficients.toArray()
        intercept = ridge_model.intercept
        
        print(f"  截距: {intercept:.4f}")
        print("  特征系数:")
        for feature, coef in zip(feature_cols, coefficients):
            print(f"    {feature}: {coef:.4f}")
        
        # 6. 模型预测
        print("\n6. 模型预测")
        predictions = ridge_model.transform(test_data)
        
        print("  预测结果样例:")
        predictions.select(
            "stock_id", "pe_ratio", "roe", "market_cap",
            "price", "prediction"
        ).show(10)
        
        # 7. 模型评估
        print("\n7. 模型评估")
        
        evaluator = RegressionEvaluator(
            labelCol="price",
            predictionCol="prediction"
        )
        
        rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
        mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
        r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
        
        print(f"  RMSE (均方根误差): {rmse:.4f}")
        print(f"  MAE (平均绝对误差): {mae:.4f}")
        print(f"  R² (决定系数): {r2:.4f}")
        
        # 8. 训练集评估
        train_predictions = ridge_model.transform(train_data)
        train_rmse = evaluator.evaluate(train_predictions, {evaluator.metricName: "rmse"})
        train_r2 = evaluator.evaluate(train_predictions, {evaluator.metricName: "r2"})
        
        print(f"\n  训练集性能:")
        print(f"    RMSE: {train_rmse:.4f}")
        print(f"    R²: {train_r2:.4f}")
        
        # 9. 正则化效果分析
        print("\n8. 正则化效果分析")
        coef_l2_norm = np.sqrt(np.sum(coefficients ** 2))
        print(f"  系数L2范数: {coef_l2_norm:.4f}")
        print(f"  正则化项: {ridge.getRegParam() * coef_l2_norm ** 2:.4f}")
        
        return {
            'model': ridge_model,
            'predictions': predictions,
            'scaler_model': scaler_model,
            'feature_cols': feature_cols,
            'metrics': {
                'rmse': rmse,
                'mae': mae,
                'r2': r2,
                'train_rmse': train_rmse,
                'train_r2': train_r2
            },
            'coefficients': dict(zip(feature_cols, coefficients)),
            'intercept': intercept,
            'coef_l2_norm': coef_l2_norm
        }
    
    def compare_regularization(self, df):
        """
        比较不同正则化强度的效果
        """
        print("\n=== 正则化强度比较 ===")
        
        # 特征工程
        feature_cols = [
            "pe_ratio", "pb_ratio", "roe", "market_cap", "volume",
            "volatility", "rsi", "macd", "gdp_growth", "inflation",
            "interest_rate", "industry"
        ]
        
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="raw_features")
        scaler = StandardScaler(inputCol="raw_features", outputCol="features", withStd=True, withMean=True)
        
        df_assembled = assembler.transform(df)
        scaler_model = scaler.fit(df_assembled)
        df_features = scaler_model.transform(df_assembled)
        
        train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)
        
        # 不同的正则化参数
        reg_params = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0]
        results = []
        
        evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction")
        
        for reg_param in reg_params:
            print(f"\n  测试正则化参数: {reg_param}")
            
            # 创建模型
            lr = LinearRegression(
                featuresCol="features",
                labelCol="price",
                regParam=reg_param,
                elasticNetParam=0.0,  # 纯L2正则化
                maxIter=100
            )
            
            # 训练模型
            model = lr.fit(train_data)
            
            # 预测
            train_pred = model.transform(train_data)
            test_pred = model.transform(test_data)
            
            # 评估
            train_rmse = evaluator.evaluate(train_pred, {evaluator.metricName: "rmse"})
            test_rmse = evaluator.evaluate(test_pred, {evaluator.metricName: "rmse"})
            train_r2 = evaluator.evaluate(train_pred, {evaluator.metricName: "r2"})
            test_r2 = evaluator.evaluate(test_pred, {evaluator.metricName: "r2"})
            
            # 系数分析
            coefficients = model.coefficients.toArray()
            coef_l2_norm = np.sqrt(np.sum(coefficients ** 2))
            
            result = {
                'reg_param': reg_param,
                'train_rmse': train_rmse,
                'test_rmse': test_rmse,
                'train_r2': train_r2,
                'test_r2': test_r2,
                'coef_l2_norm': coef_l2_norm,
                'coefficients': coefficients
            }
            
            results.append(result)
            
            print(f"    训练RMSE: {train_rmse:.4f}, 测试RMSE: {test_rmse:.4f}")
            print(f"    训练R²: {train_r2:.4f}, 测试R²: {test_r2:.4f}")
            print(f"    系数L2范数: {coef_l2_norm:.4f}")
        
        return results
    
    def visualize_ridge_results(self, model_results, comparison_results=None):
        """
        可视化岭回归结果
        """
        import matplotlib.pyplot as plt
        import seaborn as sns
        import numpy as np
        import pandas as pd
        
        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('岭回归模型分析结果', fontsize=16, fontweight='bold')
        
        predictions = model_results['predictions']
        
        # 1. 预测值 vs 实际值
        pred_actual = predictions.select("prediction", "price").toPandas()
        axes[0, 0].scatter(pred_actual['price'], pred_actual['prediction'], alpha=0.6)
        axes[0, 0].plot([pred_actual['price'].min(), pred_actual['price'].max()], 
                       [pred_actual['price'].min(), pred_actual['price'].max()], 'r--', lw=2)
        axes[0, 0].set_xlabel('实际价格')
        axes[0, 0].set_ylabel('预测价格')
        axes[0, 0].set_title('预测值 vs 实际值')
        axes[0, 0].grid(True, alpha=0.3)
        
        # 2. 残差分析
        residuals = pred_actual['price'] - pred_actual['prediction']
        axes[0, 1].scatter(pred_actual['prediction'], residuals, alpha=0.6)
        axes[0, 1].axhline(y=0, color='r', linestyle='--')
        axes[0, 1].set_xlabel('预测价格')
        axes[0, 1].set_ylabel('残差')
        axes[0, 1].set_title('残差分析')
        axes[0, 1].grid(True, alpha=0.3)
        
        # 3. 特征系数
        feature_names = list(model_results['coefficients'].keys())
        coefficients = list(model_results['coefficients'].values())
        
        # 按绝对值排序
        sorted_idx = np.argsort(np.abs(coefficients))[::-1]
        sorted_features = [feature_names[i] for i in sorted_idx]
        sorted_coefs = [coefficients[i] for i in sorted_idx]
        
        colors = ['red' if x < 0 else 'blue' for x in sorted_coefs]
        bars = axes[0, 2].bar(range(len(sorted_features)), sorted_coefs, color=colors, alpha=0.7)
        axes[0, 2].set_xticks(range(len(sorted_features)))
        axes[0, 2].set_xticklabels(sorted_features, rotation=45)
        axes[0, 2].set_ylabel('系数值')
        axes[0, 2].set_title('特征系数(岭回归)')
        axes[0, 2].axhline(y=0, color='black', linestyle='-', alpha=0.3)
        axes[0, 2].grid(True, alpha=0.3)
        
        # 4. 正则化路径(如果有比较结果)
        if comparison_results:
            reg_params = [r['reg_param'] for r in comparison_results]
            train_rmse = [r['train_rmse'] for r in comparison_results]
            test_rmse = [r['test_rmse'] for r in comparison_results]
            
            axes[1, 0].semilogx(reg_params, train_rmse, 'bo-', label='训练RMSE', linewidth=2)
            axes[1, 0].semilogx(reg_params, test_rmse, 'ro-', label='测试RMSE', linewidth=2)
            axes[1, 0].set_xlabel('正则化参数')
            axes[1, 0].set_ylabel('RMSE')
            axes[1, 0].set_title('正则化路径')
            axes[1, 0].legend()
            axes[1, 0].grid(True, alpha=0.3)
        
        # 5. 系数收缩效果
        if comparison_results:
            reg_params = [r['reg_param'] for r in comparison_results]
            coef_norms = [r['coef_l2_norm'] for r in comparison_results]
            
            axes[1, 1].semilogx(reg_params, coef_norms, 'go-', linewidth=2, markersize=6)
            axes[1, 1].set_xlabel('正则化参数')
            axes[1, 1].set_ylabel('系数L2范数')
            axes[1, 1].set_title('系数收缩效果')
            axes[1, 1].grid(True, alpha=0.3)
        
        # 6. 模型性能指标
        metrics = model_results['metrics']
        metric_names = ['RMSE', 'MAE', 'R²']
        metric_values = [metrics['rmse'], metrics['mae'], metrics['r2']]
        colors = ['red', 'orange', 'green']
        
        bars = axes[1, 2].bar(metric_names, metric_values, color=colors, alpha=0.7)
        axes[1, 2].set_ylabel('指标值')
        axes[1, 2].set_title('模型性能指标')
        axes[1, 2].grid(True, alpha=0.3)
        
        # 添加数值标签
        for bar, value in zip(bars, metric_values):
            height = bar.get_height()
            axes[1, 2].text(bar.get_x() + bar.get_width()/2., height,
                           f'{value:.4f}', ha='center', va='bottom')
        
        plt.tight_layout()
        plt.show()
        
        # 输出详细分析
        print("\n=== 岭回归模型分析报告 ===")
        print(f"模型性能指标:")
        print(f"  - RMSE: {metrics['rmse']:.4f}")
        print(f"  - MAE: {metrics['mae']:.4f}")
        print(f"  - R²: {metrics['r2']:.4f}")
        
        print(f"\n正则化效果:")
        print(f"  - 系数L2范数: {model_results['coef_l2_norm']:.4f}")
        print(f"  - 训练集R²: {metrics['train_r2']:.4f}")
        print(f"  - 测试集R²: {metrics['r2']:.4f}")
        print(f"  - 过拟合程度: {abs(metrics['train_r2'] - metrics['r2']):.4f}")
        
        if comparison_results:
            best_result = min(comparison_results, key=lambda x: x['test_rmse'])
            print(f"\n最佳正则化参数分析:")
            print(f"  - 最佳正则化参数: {best_result['reg_param']}")
            print(f"  - 最佳测试RMSE: {best_result['test_rmse']:.4f}")
            print(f"  - 对应系数范数: {best_result['coef_l2_norm']:.4f}")
        
        return {
            'residuals_mean': residuals.mean(),
            'residuals_std': residuals.std(),
            'overfitting_degree': abs(metrics['train_r2'] - metrics['r2'])
        }

# 演示代码
if __name__ == "__main__":
    # 创建演示实例
    demo = RidgeRegressionDemo()
    
    # 1. 创建数据集
    print("=== 创建股票价格预测数据集 ===")
    df = demo.create_dataset(n_samples=1500)
    
    # 2. 训练岭回归模型
    model_results = demo.train_ridge_regression(df)
    
    # 3. 比较不同正则化强度
    comparison_results = demo.compare_regularization(df)
    
    # 4. 可视化结果
    demo.visualize_ridge_results(model_results, comparison_results)
    
    print("\n=== 岭回归演示完成 ===")

**总结:**

岭回归通过L2正则化改进了线性回归,具有以下特点:

1. **优点**:
   - 有效防止过拟合
   - 处理多重共线性问题
   - 系数更稳定
   - 适合高维数据

2. **缺点**:
   - 不能进行特征选择
   - 需要调优正则化参数
   - 系数解释性略有降低

3. **适用场景**:
   - 特征数量较多
   - 存在多重共线性
   - 需要防止过拟合
   - 数据噪声较大

4. **业务建议**:
   - 股票价格预测中,市值、ROE、宏观经济指标是重要因素
   - 正则化有效控制了模型复杂度
   - 建议结合领域知识进行特征选择

### 4.3 决策树回归

决策树回归通过构建决策树来进行预测,能够捕捉非线性关系。
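
回归树在每个节点上选择使加权方差下降最大的切分(Spark 中回归树的不纯度度量为 variance),叶子节点的预测值为该叶子内样本目标值的均值:

$$
\Delta = \mathrm{Var}(D) \;-\; \frac{|D_L|}{|D|}\,\mathrm{Var}(D_L) \;-\; \frac{|D_R|}{|D|}\,\mathrm{Var}(D_R)
$$

其中 $D$ 为当前节点的样本集合,$D_L$、$D_R$ 为切分后左右子节点的样本集合。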

```python
from pyspark.sql import SparkSession
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

class DecisionTreeRegressionDemo:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("DecisionTreeRegressionDemo") \
            .getOrCreate()
    
    def create_dataset(self, n_samples=2000):
        """
        创建销售预测数据集
        """
        print(f"\n创建销售预测数据集 (样本数: {n_samples})")
        
        np.random.seed(42)
        
        # 生成产品ID
        product_ids = [f"PROD_{i:04d}" for i in range(1, n_samples + 1)]
        
        # 产品特征
        prices = np.random.lognormal(3, 0.8, n_samples)  # 价格
        prices = np.clip(prices, 10, 1000)
        
        advertising_budgets = np.random.lognormal(8, 1.2, n_samples)  # 广告预算
        
        # 季节性因素(1-4季度)
        seasons = np.random.randint(1, 5, n_samples)
        
        # 产品类别(1-5类)
        categories = np.random.randint(1, 6, n_samples)
        
        # 竞争对手数量
        competitors = np.random.poisson(3, n_samples)
        competitors = np.clip(competitors, 0, 10)
        
        # 市场成熟度(1-10)
        market_maturity = np.random.randint(1, 11, n_samples)
        
        # 品牌知名度(1-10)
        brand_awareness = np.random.randint(1, 11, n_samples)
        
        # 产品质量评分(1-10)
        quality_scores = np.random.normal(7, 1.5, n_samples)
        quality_scores = np.clip(quality_scores, 1, 10)
        
        # 促销活动(0或1)
        promotions = np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
        
        # 在线销售渠道(0或1)
        online_channels = np.random.choice([0, 1], n_samples, p=[0.4, 0.6])
        
        # 生成销售额(基于复杂的非线性关系)
        base_sales = np.zeros(n_samples)
        
        for i in range(n_samples):
            # 价格效应(非线性)
            if prices[i] < 50:
                price_effect = prices[i] * 2
            elif prices[i] < 200:
                price_effect = 100 + (prices[i] - 50) * 1.5
            else:
                price_effect = 325 + (prices[i] - 200) * 0.8
            
            # 广告效应(对数关系)
            ad_effect = np.log(advertising_budgets[i] + 1) * 50
            
            # 季节性效应
            season_multipliers = {1: 0.8, 2: 1.0, 3: 1.2, 4: 1.5}
            season_effect = season_multipliers[seasons[i]]
            
            # 类别效应
            category_multipliers = {1: 0.9, 2: 1.1, 3: 1.0, 4: 1.3, 5: 0.7}
            category_effect = category_multipliers[categories[i]]
            
            # 竞争效应
            competition_effect = max(0.5, 1.0 - competitors[i] * 0.05)
            
            # 品牌和质量交互效应
            brand_quality_effect = (brand_awareness[i] * quality_scores[i]) / 50
            
            # 促销效应
            promotion_effect = 1.3 if promotions[i] else 1.0
            
            # 在线渠道效应
            online_effect = 1.2 if online_channels[i] else 1.0
            
            # 市场成熟度效应(倒U型)
            maturity_effect = 0.5 + 0.1 * market_maturity[i] - 0.005 * market_maturity[i] ** 2
            
            base_sales[i] = (
                price_effect * ad_effect * season_effect * category_effect *
                competition_effect * brand_quality_effect * promotion_effect *
                online_effect * maturity_effect
            )
        
        # 添加随机噪声
        noise = np.random.normal(0, base_sales * 0.1)
        sales = base_sales + noise
        sales = np.maximum(sales, 100)  # 最低销售额100
        
        # 创建DataFrame
        data = [
            (
                product_ids[i], float(prices[i]), float(advertising_budgets[i]),
                int(seasons[i]), int(categories[i]), int(competitors[i]),
                int(market_maturity[i]), int(brand_awareness[i]), float(quality_scores[i]),
                int(promotions[i]), int(online_channels[i]), float(sales[i])
            )
            for i in range(n_samples)
        ]
        
        schema = StructType([
            StructField("product_id", StringType(), True),
            StructField("price", DoubleType(), True),
            StructField("advertising_budget", DoubleType(), True),
            StructField("season", IntegerType(), True),
            StructField("category", IntegerType(), True),
            StructField("competitors", IntegerType(), True),
            StructField("market_maturity", IntegerType(), True),
            StructField("brand_awareness", IntegerType(), True),
            StructField("quality_score", DoubleType(), True),
            StructField("promotion", IntegerType(), True),
            StructField("online_channel", IntegerType(), True),
            StructField("sales", DoubleType(), True)
        ])
        
        self.df = self.spark.createDataFrame(data, schema)
        
        print(f"  生成数据集大小: {self.df.count()} 行, {len(self.df.columns)} 列")
        
        # 显示数据概览
        print("\n  数据概览:")
        self.df.show(10)
        
        # 显示数据统计
        print("\n  数据统计:")
        self.df.describe().show()
        
        return self.df
    
    def compare_tree_depths(self, df):
        """
        比较不同树深度的性能
        """
        print("\n=== 比较不同树深度的性能 ===")
        
        # 准备数据
        education_indexer = StringIndexer(inputCol="education", outputCol="education_index")
        label_indexer = StringIndexer(inputCol="performance_level", outputCol="label")
        assembler = VectorAssembler(
            inputCols=["age", "education_index", "experience", "performance_score", "rating"],
            outputCol="features"
        )
        
        df_indexed = education_indexer.fit(df).transform(df)
        df_labeled = label_indexer.fit(df_indexed).transform(df_indexed)
        df_features = assembler.transform(df_labeled)
        
        train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)
        
        # 测试不同深度
        depths = [2, 3, 4, 5, 6, 7, 8]
        results = []
        
        evaluator = MulticlassClassificationEvaluator(
            labelCol="label",
            predictionCol="prediction",
            metricName="accuracy"
        )
        
        for depth in depths:
            print(f"\n  测试深度: {depth}")
            
            dt = DecisionTreeClassifier(
                featuresCol="features",
                labelCol="label",
                maxDepth=depth
            )
            
            model = dt.fit(train_data)
            predictions = model.transform(test_data)
            
            train_accuracy = evaluator.evaluate(model.transform(train_data))
            test_accuracy = evaluator.evaluate(predictions)
            
            results.append({
                'depth': depth,
                'train_accuracy': train_accuracy,
                'test_accuracy': test_accuracy,
                'num_nodes': model.numNodes,
                'overfitting': train_accuracy - test_accuracy
            })
            
            print(f"    训练准确率: {train_accuracy:.4f}")
            print(f"    测试准确率: {test_accuracy:.4f}")
            print(f"    节点数量: {model.numNodes}")
            print(f"    过拟合程度: {train_accuracy - test_accuracy:.4f}")
        
        return results
    
    def visualize_results(self, basic_results, tuning_results, depth_results):
        """
        可视化决策树结果
        """
        print("\n=== 决策树结果可视化 ===")
        
        import matplotlib.pyplot as plt
        import seaborn as sns
        import pandas as pd
        import numpy as np
        
        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('决策树分析结果', fontsize=16, fontweight='bold')
        
        # 1. 基本性能指标
        metrics = basic_results['metrics']
        metric_names = list(metrics.keys())
        metric_values = list(metrics.values())
        
        axes[0, 0].bar(metric_names, metric_values, color=['skyblue', 'lightgreen', 'lightcoral', 'gold'])
        axes[0, 0].set_title('决策树性能指标')
        axes[0, 0].set_ylabel('分数')
        axes[0, 0].set_ylim(0, 1)
        axes[0, 0].tick_params(axis='x', rotation=45)
        
        # 添加数值标签
        for i, v in enumerate(metric_values):
            axes[0, 0].text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')
        
        # 2. 特征重要性
        feature_importance = basic_results['feature_importances']
        features = list(feature_importance.keys())
        importances = list(feature_importance.values())
        
        # 按重要性排序
        sorted_indices = np.argsort(importances)[::-1]
        sorted_features = [features[i] for i in sorted_indices]
        sorted_importances = [importances[i] for i in sorted_indices]
        
        axes[0, 1].barh(sorted_features, sorted_importances)
        axes[0, 1].set_title('特征重要性')
        axes[0, 1].set_xlabel('重要性分数')
        
        # 3. 超参数调优结果
        tuning_metrics = tuning_results['avg_metrics']
        param_combinations = [f'组合{i+1}' for i in range(len(tuning_metrics))]
        
        axes[0, 2].plot(param_combinations, tuning_metrics, marker='o', linewidth=2, markersize=8)
        axes[0, 2].set_title('超参数调优结果')
        axes[0, 2].set_ylabel('交叉验证准确率')
        axes[0, 2].tick_params(axis='x', rotation=45)
        axes[0, 2].grid(True, alpha=0.3)
        
        # 标记最佳点
        best_idx = np.argmax(tuning_metrics)
        axes[0, 2].scatter(best_idx, tuning_metrics[best_idx], color='red', s=100, zorder=5)
        axes[0, 2].annotate(f'最佳: {tuning_metrics[best_idx]:.3f}', 
                           xy=(best_idx, tuning_metrics[best_idx]),
                           xytext=(10, 10), textcoords='offset points',
                           bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7),
                           arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
        
        # 4. 树深度对比
        depths = [r['depth'] for r in depth_results]
        train_accs = [r['train_accuracy'] for r in depth_results]
        test_accs = [r['test_accuracy'] for r in depth_results]
        
        axes[1, 0].plot(depths, train_accs, marker='o', label='训练准确率', linewidth=2)
        axes[1, 0].plot(depths, test_accs, marker='s', label='测试准确率', linewidth=2)
        axes[1, 0].set_title('树深度 vs 准确率')
        axes[1, 0].set_xlabel('树深度')
        axes[1, 0].set_ylabel('准确率')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)
        
        # 5. 过拟合分析
        overfitting = [r['overfitting'] for r in depth_results]
        
        axes[1, 1].bar(depths, overfitting, color='orange', alpha=0.7)
        axes[1, 1].set_title('过拟合程度分析')
        axes[1, 1].set_xlabel('树深度')
        axes[1, 1].set_ylabel('过拟合程度 (训练-测试)')
        axes[1, 1].axhline(y=0, color='red', linestyle='--', alpha=0.5)
        
        # 6. 模型复杂度
        num_nodes = [r['num_nodes'] for r in depth_results]
        
        axes[1, 2].plot(depths, num_nodes, marker='o', linewidth=2, markersize=8, color='purple')
        axes[1, 2].set_title('模型复杂度(节点数量)')
        axes[1, 2].set_xlabel('树深度')
        axes[1, 2].set_ylabel('节点数量')
        axes[1, 2].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()
        
        # 打印总结
        print("\n=== 决策树分析总结 ===")
        print("1. 基本性能:")
        print(f"   - 准确率: {metrics['accuracy']:.4f}")
        print(f"   - F1分数: {metrics['f1']:.4f}")
        
        print("\n2. 特征重要性(前3名):")
        for i, (feature, importance) in enumerate(zip(sorted_features[:3], sorted_importances[:3])):
            print(f"   {i+1}. {feature}: {importance:.4f}")
        
        print("\n3. 最佳超参数:")
        best_model = tuning_results['best_model']
        best_dt = best_model.stages[-1]
        print(f"   - 最大深度: {best_dt.getMaxDepth()}")
        print(f"   - 最小实例数: {best_dt.getMinInstancesPerNode()}")
        print(f"   - 不纯度度量: {best_dt.getImpurity()}")
        print(f"   - 最佳分数: {tuning_results['best_score']:.4f}")
        
        print("\n4. 深度分析:")
        best_depth_result = max(depth_results, key=lambda x: x['test_accuracy'])
        print(f"   - 最佳深度: {best_depth_result['depth']}")
        print(f"   - 对应测试准确率: {best_depth_result['test_accuracy']:.4f}")
        print(f"   - 过拟合程度: {best_depth_result['overfitting']:.4f}")
        
        print("\n5. 决策树特点:")
        print("   - 易于理解和解释")
        print("   - 不需要特征缩放")
        print("   - 能处理数值和分类特征")
        print("   - 容易过拟合,需要剪枝")
        print("   - 对噪声敏感")

# 演示决策树
if __name__ == "__main__":
    # 创建决策树演示器
    dt_demo = DecisionTreeDemo()
    
    # 创建示例数据
    df = dt_demo.create_sample_data()
    
    print("=== 决策树演示 ===")
    
    # 1. 基本决策树训练
    basic_results = dt_demo.train_decision_tree(df)
    
    # 2. 超参数调优
    tuning_results = dt_demo.hyperparameter_tuning(df)
    
    # 3. 深度对比分析
    depth_results = dt_demo.compare_tree_depths(df)
    
    # 4. 结果可视化
    dt_demo.visualize_results(basic_results, tuning_results, depth_results)
    
    # 停止Spark会话
    dt_demo.spark.stop()
```

### 3.3 随机森林

随机森林是一种集成学习方法,通过构建多个决策树并结合它们的预测来提高模型性能。
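
在进入下面完整的客户流失预测案例之前,先给出一个最小可运行的示意,说明随机森林的两个关键随机化手段:样本自助采样(`subsamplingRate`)与分裂时的特征子集抽样(`featureSubsetStrategy`);数据与列名仅为示例假设:

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

spark = SparkSession.builder.appName("RandomForestSketch").getOrCreate()

# 构造一个极简的二分类数据集(仅为演示假设)
raw = spark.createDataFrame(
    [(1.0, 2.0, 0.0), (2.0, 1.5, 0.0), (1.5, 2.5, 0.0),
     (8.0, 8.5, 1.0), (9.0, 8.0, 1.0), (8.5, 9.5, 1.0)],
    ["f1", "f2", "label"]
)
data = VectorAssembler(inputCols=["f1", "f2"], outputCol="features").transform(raw)

rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=20,                     # 森林中树的数量
    maxDepth=3,                      # 限制单棵树深度,控制方差
    featureSubsetStrategy="auto",    # 每次分裂随机抽取的特征子集策略
    subsamplingRate=0.8,             # 每棵树的bootstrap采样比例
    seed=42
)
model = rf.fit(data)

print("特征重要性:", model.featureImportances)  # 各特征在所有树上的平均贡献
model.transform(data).select("features", "label", "prediction", "probability").show()
```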

#### 3.3.1 随机森林原理

```python
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import pandas as pd

class RandomForestDemo:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("RandomForestDemo") \
            .config("spark.sql.adaptive.enabled", "true") \
            .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
            .getOrCreate()
        
        self.spark.sparkContext.setLogLevel("WARN")
    
    def create_sample_data(self):
        """
        创建客户流失预测示例数据
        """
        print("\n=== 创建客户流失预测数据集 ===")
        
        # 设置随机种子
        np.random.seed(42)
        
        # 生成客户数据
        n_samples = 2000
        
        # 客户基本信息
        customer_ids = [f"C{i:06d}" for i in range(1, n_samples + 1)]
        ages = np.random.normal(40, 12, n_samples).astype(int)
        ages = np.clip(ages, 18, 80)
        
        genders = np.random.choice(['Male', 'Female'], n_samples, p=[0.52, 0.48])
        
        # 服务信息
        tenure_months = np.random.exponential(24, n_samples).astype(int)
        tenure_months = np.clip(tenure_months, 1, 72)
        
        contract_types = np.random.choice(
            ['Month-to-month', 'One year', 'Two year'], 
            n_samples, 
            p=[0.5, 0.3, 0.2]
        )
        
        # 使用服务
        internet_service = np.random.choice(
            ['DSL', 'Fiber optic', 'No'], 
            n_samples, 
            p=[0.4, 0.45, 0.15]
        )
        
        online_security = np.random.choice(['Yes', 'No'], n_samples, p=[0.3, 0.7])
        tech_support = np.random.choice(['Yes', 'No'], n_samples, p=[0.35, 0.65])
        
        # 费用信息
        monthly_charges = np.random.normal(65, 20, n_samples)
        monthly_charges = np.clip(monthly_charges, 20, 120)
        
        total_charges = monthly_charges * tenure_months + np.random.normal(0, 50, n_samples)
        total_charges = np.maximum(total_charges, monthly_charges)
        
        # 客户满意度和投诉
        satisfaction_scores = np.random.normal(3.5, 1.2, n_samples)
        satisfaction_scores = np.clip(satisfaction_scores, 1, 5)
        
        complaint_count = np.random.poisson(1.5, n_samples)
        complaint_count = np.clip(complaint_count, 0, 10)
        
        # 生成流失标签(基于业务逻辑)
        churn_prob = (
            0.1 +  # 基础流失率
            0.3 * (contract_types == 'Month-to-month') +  # 月付合同更容易流失
            0.2 * (satisfaction_scores < 2.5) +  # 低满意度
            0.15 * (complaint_count > 3) +  # 投诉多
            0.1 * (tenure_months < 6) +  # 新客户
            0.05 * (monthly_charges > 80)  # 高费用
        )
        
        # 添加随机性
        churn_prob += np.random.normal(0, 0.1, n_samples)
        churn_prob = np.clip(churn_prob, 0, 1)
        
        churn = np.random.binomial(1, churn_prob, n_samples)
        churn_labels = ['No' if c == 0 else 'Yes' for c in churn]
        
        # 创建DataFrame
        data = [
            (
                customer_ids[i], int(ages[i]), genders[i], int(tenure_months[i]),
                contract_types[i], internet_service[i], online_security[i], tech_support[i],
                float(monthly_charges[i]), float(total_charges[i]), 
                float(satisfaction_scores[i]), int(complaint_count[i]), churn_labels[i]
            )
            for i in range(n_samples)
        ]
        
        schema = StructType([
            StructField("customer_id", StringType(), True),
            StructField("age", IntegerType(), True),
            StructField("gender", StringType(), True),
            StructField("tenure_months", IntegerType(), True),
            StructField("contract_type", StringType(), True),
            StructField("internet_service", StringType(), True),
            StructField("online_security", StringType(), True),
            StructField("tech_support", StringType(), True),
            StructField("monthly_charges", DoubleType(), True),
            StructField("total_charges", DoubleType(), True),
            StructField("satisfaction_score", DoubleType(), True),
            StructField("complaint_count", IntegerType(), True),
            StructField("churn", StringType(), True)
        ])
        
        self.df = self.spark.createDataFrame(data, schema)
        
        print(f"  生成数据集大小: {self.df.count()} 行, {len(self.df.columns)} 列")
        
        # 显示数据概览
        print("\n  数据概览:")
        self.df.show(10)
        
        # 显示数据统计
        print("\n  数据统计:")
        self.df.describe().show()
        
        # 流失率分布
        print("\n  流失率分布:")
        self.df.groupBy("churn").count().orderBy("churn").show()
        
        return self.df
    
    def train_random_forest(self, df):
        """
        训练随机森林模型
        """
        print("\n=== 随机森林模型训练 ===")
        
        # 1. 特征工程
        print("\n1. 特征工程")
        
        # 字符串索引化
        gender_indexer = StringIndexer(inputCol="gender", outputCol="gender_index")
        contract_indexer = StringIndexer(inputCol="contract_type", outputCol="contract_index")
        internet_indexer = StringIndexer(inputCol="internet_service", outputCol="internet_index")
        security_indexer = StringIndexer(inputCol="online_security", outputCol="security_index")
        support_indexer = StringIndexer(inputCol="tech_support", outputCol="support_index")
        label_indexer = StringIndexer(inputCol="churn", outputCol="label")
        
        # 组装特征向量
        assembler = VectorAssembler(
            inputCols=[
                "age", "tenure_months", "monthly_charges", "total_charges",
                "satisfaction_score", "complaint_count", "gender_index",
                "contract_index", "internet_index", "security_index", "support_index"
            ],
            outputCol="features"
        )
        
        # 应用转换
        df_processed = gender_indexer.fit(df).transform(df)
        df_processed = contract_indexer.fit(df_processed).transform(df_processed)
        df_processed = internet_indexer.fit(df_processed).transform(df_processed)
        df_processed = security_indexer.fit(df_processed).transform(df_processed)
        df_processed = support_indexer.fit(df_processed).transform(df_processed)
        df_processed = label_indexer.fit(df_processed).transform(df_processed)
        df_features = assembler.transform(df_processed)
        
        print("  特征工程完成")
        print(f"  特征维度: {len(assembler.getInputCols())}")
        
        # 2. 数据分割
        print("\n2. 数据分割")
        train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)
        
        print(f"  训练集大小: {train_data.count()}")
        print(f"  测试集大小: {test_data.count()}")
        
        # 3. 创建随机森林模型
        print("\n3. 创建随机森林模型")
        
        rf = RandomForestClassifier(
            featuresCol="features",
            labelCol="label",
            numTrees=100,
            maxDepth=5,
            minInstancesPerNode=1,
            minInfoGain=0.0,
            maxBins=32,
            seed=42
        )
        
        # 4. 训练模型
        print("\n4. 训练模型")
        rf_model = rf.fit(train_data)
        
        print("  模型训练完成")
        print(f"  树的数量: {rf_model.getNumTrees}")
        print(f"  特征子集策略: {rf_model.getFeatureSubsetStrategy()}")
        
        # 5. 特征重要性
        print("\n5. 特征重要性")
        feature_names = assembler.getInputCols()
        importances = rf_model.featureImportances.toArray()
        
        print("  特征重要性排序:")
        feature_importance_pairs = list(zip(feature_names, importances))
        feature_importance_pairs.sort(key=lambda x: x[1], reverse=True)
        
        for name, importance in feature_importance_pairs:
            print(f"    {name}: {importance:.4f}")
        
        # 6. 模型预测
        print("\n6. 模型预测")
        predictions = rf_model.transform(test_data)
        
        print("  预测结果样例:")
        predictions.select(
            "customer_id", "age", "tenure_months", "monthly_charges",
            "churn", "label", "prediction", "probability"
        ).show(10, truncate=False)
        
        # 7. 模型评估
        print("\n7. 模型评估")
        
        evaluator = MulticlassClassificationEvaluator(
            labelCol="label",
            predictionCol="prediction"
        )
        
        accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
        precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
        recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
        f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
        
        print(f"  准确率: {accuracy:.4f}")
        print(f"  加权精确率: {precision:.4f}")
        print(f"  加权召回率: {recall:.4f}")
        print(f"  加权F1分数: {f1:.4f}")
        
        # 8. 混淆矩阵
        print("\n8. 混淆矩阵")
        confusion_matrix = predictions.groupBy("label", "prediction").count().orderBy("label", "prediction")
        confusion_matrix.show()
        
        # 9. AUC评估(二分类)
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        binary_evaluator = BinaryClassificationEvaluator(
            labelCol="label",
            rawPredictionCol="rawPrediction",
            metricName="areaUnderROC"
        )
        auc = binary_evaluator.evaluate(predictions)
        print(f"\n  AUC: {auc:.4f}")
        
        return {
            'model': rf_model,
            'predictions': predictions,
            'metrics': {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'auc': auc
            },
            'feature_importances': dict(feature_importance_pairs)
        }
    
    def hyperparameter_tuning(self, df):
        """
        随机森林超参数调优
        """
        print("\n=== 随机森林超参数调优 ===")
        
        # 1. 创建Pipeline
        gender_indexer = StringIndexer(inputCol="gender", outputCol="gender_index")
        contract_indexer = StringIndexer(inputCol="contract_type", outputCol="contract_index")
        internet_indexer = StringIndexer(inputCol="internet_service", outputCol="internet_index")
        security_indexer = StringIndexer(inputCol="online_security", outputCol="security_index")
        support_indexer = StringIndexer(inputCol="tech_support", outputCol="support_index")
        label_indexer = StringIndexer(inputCol="churn", outputCol="label")
        
        assembler = VectorAssembler(
            inputCols=[
                "age", "tenure_months", "monthly_charges", "total_charges",
                "satisfaction_score", "complaint_count", "gender_index",
                "contract_index", "internet_index", "security_index", "support_index"
            ],
            outputCol="features"
        )
        
        rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=42)
        
        pipeline = Pipeline(stages=[
            gender_indexer, contract_indexer, internet_indexer, 
            security_indexer, support_indexer, label_indexer, assembler, rf
        ])
        
        # 2. 参数网格
        param_grid = ParamGridBuilder() \
            .addGrid(rf.numTrees, [50, 100, 200]) \
            .addGrid(rf.maxDepth, [3, 5, 7]) \
            .addGrid(rf.minInstancesPerNode, [1, 5, 10]) \
            .build()
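        # 补充说明:3 × 3 × 3 = 27 组参数组合;结合下面 numFolds=3 的交叉验证,
        # 共需训练约 27 × 3 = 81 个模型,数据量较大时请预留足够的计算时间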
        
        print(f"  参数组合数量: {len(param_grid)}")
        
        # 3. 交叉验证
        evaluator = BinaryClassificationEvaluator(
            labelCol="label",
            rawPredictionCol="rawPrediction",
            metricName="areaUnderROC"
        )
        
        cv = CrossValidator(
            estimator=pipeline,
            estimatorParamMaps=param_grid,
            evaluator=evaluator,
            numFolds=3,
            seed=42
        )
        
        # 4. 训练和选择最佳模型
        print("\n  开始交叉验证...")
        cv_model = cv.fit(df)
        
        # 5. 最佳参数
        best_pipeline = cv_model.bestModel
        best_rf = best_pipeline.stages[-1]  # 随机森林是Pipeline的最后一个阶段
        
        print("\n  最佳参数:")
        print(f"    numTrees: {best_rf.getNumTrees}")
        print(f"    maxDepth: {best_rf.getMaxDepth()}")
        print(f"    minInstancesPerNode: {best_rf.getMinInstancesPerNode()}")
        
        # 6. 交叉验证结果
        print("\n  交叉验证平均AUC:")
        avg_metrics = cv_model.avgMetrics
        for i, score in enumerate(avg_metrics):
            print(f"    参数组合 {i+1}: {score:.4f}")
        
        best_score = max(avg_metrics)
        print(f"\n  最佳交叉验证AUC: {best_score:.4f}")
        
        return {
            'best_model': best_pipeline,
            'cv_model': cv_model,
            'best_score': best_score,
            'avg_metrics': avg_metrics
        }
    
    def compare_tree_numbers(self, df):
        """
        比较不同树数量的性能
        """
        print("\n=== 比较不同树数量的性能 ===")
        
        # 准备数据
        gender_indexer = StringIndexer(inputCol="gender", outputCol="gender_index")
        contract_indexer = StringIndexer(inputCol="contract_type", outputCol="contract_index")
        internet_indexer = StringIndexer(inputCol="internet_service", outputCol="internet_index")
        security_indexer = StringIndexer(inputCol="online_security", outputCol="security_index")
        support_indexer = StringIndexer(inputCol="tech_support", outputCol="support_index")
        label_indexer = StringIndexer(inputCol="churn", outputCol="label")
        
        assembler = VectorAssembler(
            inputCols=[
                "age", "tenure_months", "monthly_charges", "total_charges",
                "satisfaction_score", "complaint_count", "gender_index",
                "contract_index", "internet_index", "security_index", "support_index"
            ],
            outputCol="features"
        )
        
        # 应用转换
        df_processed = gender_indexer.fit(df).transform(df)
        df_processed = contract_indexer.fit(df_processed).transform(df_processed)
        df_processed = internet_indexer.fit(df_processed).transform(df_processed)
        df_processed = security_indexer.fit(df_processed).transform(df_processed)
        df_processed = support_indexer.fit(df_processed).transform(df_processed)
        df_processed = label_indexer.fit(df_processed).transform(df_processed)
        df_features = assembler.transform(df_processed)
        
        train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)
        
        # 测试不同树数量
        tree_numbers = [10, 25, 50, 100, 150, 200]
        results = []
        
        accuracy_evaluator = MulticlassClassificationEvaluator(
            labelCol="label",
            predictionCol="prediction",
            metricName="accuracy"
        )
        
        auc_evaluator = BinaryClassificationEvaluator(
            labelCol="label",
            rawPredictionCol="rawPrediction",
            metricName="areaUnderROC"
        )
        
        for num_trees in tree_numbers:
            print(f"\n  测试树数量: {num_trees}")
            
            rf = RandomForestClassifier(
                featuresCol="features",
                labelCol="label",
                numTrees=num_trees,
                maxDepth=5,
                seed=42
            )
            
            model = rf.fit(train_data)
            predictions = model.transform(test_data)
            
            train_predictions = model.transform(train_data)
            train_accuracy = accuracy_evaluator.evaluate(train_predictions)
            train_auc = auc_evaluator.evaluate(train_predictions)
            
            test_accuracy = accuracy_evaluator.evaluate(predictions)
            test_auc = auc_evaluator.evaluate(predictions)
            
            results.append({
                'num_trees': num_trees,
                'train_accuracy': train_accuracy,
                'test_accuracy': test_accuracy,
                'train_auc': train_auc,
                'test_auc': test_auc,
                'overfitting_accuracy': train_accuracy - test_accuracy,
                'overfitting_auc': train_auc - test_auc
            })
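            # 补充说明:训练集与测试集指标之差可粗略衡量过拟合程度,差值越大说明泛化能力越差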
            
            print(f"    训练准确率: {train_accuracy:.4f}")
            print(f"    测试准确率: {test_accuracy:.4f}")
            print(f"    训练AUC: {train_auc:.4f}")
            print(f"    测试AUC: {test_auc:.4f}")
        
        return results
    
    def visualize_results(self, basic_results, tuning_results, tree_results):
        """
        可视化随机森林结果
        """
        print("\n=== 随机森林结果可视化 ===")
        
        import matplotlib.pyplot as plt
        import seaborn as sns
        import pandas as pd
        import numpy as np
        
        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        # 创建图形
        fig, axes = plt.subplots(3, 3, figsize=(20, 15))
        fig.suptitle('随机森林分析结果', fontsize=16, fontweight='bold')
        
        # 1. 基本性能指标
        metrics = basic_results['metrics']
        metric_names = list(metrics.keys())
        metric_values = list(metrics.values())
        
        axes[0, 0].bar(metric_names, metric_values, 
                      color=['skyblue', 'lightgreen', 'lightcoral', 'gold', 'orange'])
        axes[0, 0].set_title('随机森林性能指标')
        axes[0, 0].set_ylabel('分数')
        axes[0, 0].set_ylim(0, 1)
        axes[0, 0].tick_params(axis='x', rotation=45)
        
        # 添加数值标签
        for i, v in enumerate(metric_values):
            axes[0, 0].text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')
        
        # 2. 特征重要性
        feature_importance = basic_results['feature_importances']
        features = list(feature_importance.keys())
        importances = list(feature_importance.values())
        
        # 按重要性排序
        sorted_indices = np.argsort(importances)[::-1]
        sorted_features = [features[i] for i in sorted_indices]
        sorted_importances = [importances[i] for i in sorted_indices]
        
        axes[0, 1].barh(sorted_features, sorted_importances)
        axes[0, 1].set_title('特征重要性')
        axes[0, 1].set_xlabel('重要性分数')
        
        # 3. 超参数调优结果
        tuning_metrics = tuning_results['avg_metrics']
        param_combinations = [f'组合{i+1}' for i in range(len(tuning_metrics))]
        
        axes[0, 2].plot(param_combinations, tuning_metrics, marker='o', linewidth=2, markersize=8)
        axes[0, 2].set_title('超参数调优结果')
        axes[0, 2].set_ylabel('交叉验证AUC')
        axes[0, 2].tick_params(axis='x', rotation=45)
        axes[0, 2].grid(True, alpha=0.3)
        
        # 标记最佳点
        best_idx = np.argmax(tuning_metrics)
        axes[0, 2].scatter(best_idx, tuning_metrics[best_idx], color='red', s=100, zorder=5)
        axes[0, 2].annotate(f'最佳: {tuning_metrics[best_idx]:.3f}', 
                           xy=(best_idx, tuning_metrics[best_idx]),
                           xytext=(10, 10), textcoords='offset points',
                           bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7),
                           arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
        
        # 4. 树数量对准确率的影响
        tree_numbers = [r['num_trees'] for r in tree_results]
        train_accs = [r['train_accuracy'] for r in tree_results]
        test_accs = [r['test_accuracy'] for r in tree_results]
        
        axes[1, 0].plot(tree_numbers, train_accs, marker='o', label='训练准确率', linewidth=2)
        axes[1, 0].plot(tree_numbers, test_accs, marker='s', label='测试准确率', linewidth=2)
        axes[1, 0].set_title('树数量 vs 准确率')
        axes[1, 0].set_xlabel('树数量')
        axes[1, 0].set_ylabel('准确率')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)
        
        # 5. 树数量对AUC的影响
        train_aucs = [r['train_auc'] for r in tree_results]
        test_aucs = [r['test_auc'] for r in tree_results]
        
        axes[1, 1].plot(tree_numbers, train_aucs, marker='o', label='训练AUC', linewidth=2)
        axes[1, 1].plot(tree_numbers, test_aucs, marker='s', label='测试AUC', linewidth=2)
        axes[1, 1].set_title('树数量 vs AUC')
        axes[1, 1].set_xlabel('树数量')
        axes[1, 1].set_ylabel('AUC')
        axes[1, 1].legend()
        axes[1, 1].grid(True, alpha=0.3)
        
        # 6. 过拟合分析(准确率)
        overfitting_acc = [r['overfitting_accuracy'] for r in tree_results]
        
        axes[1, 2].bar(tree_numbers, overfitting_acc, color='orange', alpha=0.7)
        axes[1, 2].set_title('过拟合程度分析(准确率)')
        axes[1, 2].set_xlabel('树数量')
        axes[1, 2].set_ylabel('过拟合程度 (训练-测试)')
        axes[1, 2].axhline(y=0, color='red', linestyle='--', alpha=0.5)
        
        # 7. 过拟合分析(AUC)
        overfitting_auc = [r['overfitting_auc'] for r in tree_results]
        
        axes[2, 0].bar(tree_numbers, overfitting_auc, color='purple', alpha=0.7)
        axes[2, 0].set_title('过拟合程度分析(AUC)')
        axes[2, 0].set_xlabel('树数量')
        axes[2, 0].set_ylabel('过拟合程度 (训练-测试)')
        axes[2, 0].axhline(y=0, color='red', linestyle='--', alpha=0.5)
        
        # 8. 性能稳定性分析
        accuracy_variance = np.var([r['test_accuracy'] for r in tree_results])
        auc_variance = np.var([r['test_auc'] for r in tree_results])
        
        stability_metrics = ['准确率方差', 'AUC方差']
        stability_values = [accuracy_variance, auc_variance]
        
        axes[2, 1].bar(stability_metrics, stability_values, color=['lightblue', 'lightgreen'])
        axes[2, 1].set_title('性能稳定性分析')
        axes[2, 1].set_ylabel('方差')
        
        # 添加数值标签
        for i, v in enumerate(stability_values):
            axes[2, 1].text(i, v + max(stability_values) * 0.01, f'{v:.6f}', 
                           ha='center', va='bottom')
        
        # 9. 算法比较(假设与决策树对比)
        comparison_data = {
            '算法': ['决策树', '随机森林'],
            '准确率': [0.85, metrics['accuracy']],  # 假设决策树准确率
            'AUC': [0.82, metrics['auc']],  # 假设决策树AUC
            'F1分数': [0.83, metrics['f1']]  # 假设决策树F1
        }
        
        x = np.arange(len(comparison_data['算法']))
        width = 0.25
        
        axes[2, 2].bar(x - width, comparison_data['准确率'], width, label='准确率', alpha=0.8)
        axes[2, 2].bar(x, comparison_data['AUC'], width, label='AUC', alpha=0.8)
        axes[2, 2].bar(x + width, comparison_data['F1分数'], width, label='F1分数', alpha=0.8)
        
        axes[2, 2].set_title('算法性能比较')
        axes[2, 2].set_ylabel('分数')
        axes[2, 2].set_xticks(x)
        axes[2, 2].set_xticklabels(comparison_data['算法'])
        axes[2, 2].legend()
        axes[2, 2].set_ylim(0, 1)
        
        plt.tight_layout()
        plt.show()
        
        # 打印总结
        print("\n=== 随机森林分析总结 ===")
        print("1. 基本性能:")
        print(f"   - 准确率: {metrics['accuracy']:.4f}")
        print(f"   - AUC: {metrics['auc']:.4f}")
        print(f"   - F1分数: {metrics['f1']:.4f}")
        
        print("\n2. 特征重要性(前5名):")
        for i, (feature, importance) in enumerate(zip(sorted_features[:5], sorted_importances[:5])):
            print(f"   {i+1}. {feature}: {importance:.4f}")
        
        print("\n3. 最佳超参数:")
        best_model = tuning_results['best_model']
        best_rf = best_model.stages[-1]
        print(f"   - 树数量: {best_rf.getNumTrees}")
        print(f"   - 最大深度: {best_rf.getMaxDepth()}")
        print(f"   - 最小实例数: {best_rf.getMinInstancesPerNode()}")
        print(f"   - 最佳AUC: {tuning_results['best_score']:.4f}")
        
        print("\n4. 树数量分析:")
        best_tree_result = max(tree_results, key=lambda x: x['test_auc'])
        print(f"   - 最佳树数量: {best_tree_result['num_trees']}")
        print(f"   - 对应测试AUC: {best_tree_result['test_auc']:.4f}")
        print(f"   - 过拟合程度(AUC): {best_tree_result['overfitting_auc']:.4f}")
        
        print("\n5. 随机森林优势:")
        print("   - 减少过拟合风险")
        print("   - 提供特征重要性")
        print("   - 处理缺失值能力强")
        print("   - 对异常值不敏感")
        print("   - 可并行训练")
        
        print("\n6. 业务建议:")
        # 基于特征重要性给出业务建议
        top_features = sorted_features[:3]
        if 'satisfaction_score' in top_features:
            print("   - 客户满意度是流失的重要因素,建议加强客户服务")
        if 'tenure_months' in top_features:
            print("   - 客户使用时长影响流失,建议针对新客户制定留存策略")
        if 'monthly_charges' in top_features:
            print("   - 月费用是重要因素,建议优化定价策略")
        if 'complaint_count' in top_features:
            print("   - 投诉次数影响流失,建议建立有效的投诉处理机制")

# 演示随机森林
if __name__ == "__main__":
    # 创建随机森林演示器
    rf_demo = RandomForestDemo()
    
    # 创建示例数据
    df = rf_demo.create_sample_data()
    
    print("=== 随机森林演示 ===")
    
    # 1. 基本随机森林训练
    basic_results = rf_demo.train_random_forest(df)
    
    # 2. 超参数调优
    tuning_results = rf_demo.hyperparameter_tuning(df)
    
    # 3. 树数量对比分析
    tree_results = rf_demo.compare_tree_numbers(df)
    
    # 4. 结果可视化
    rf_demo.visualize_results(basic_results, tuning_results, tree_results)
    
    # 停止Spark会话
    rf_demo.spark.stop()
```

4. 回归算法

回归算法用于预测连续的数值型目标变量(如价格、销量等),是监督学习的重要组成部分。
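
为便于后文对照,这里先给出三个常用回归评估指标的一般定义(示意性公式,与具体框架无关):

$$
\mathrm{RMSE}=\sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i-\hat{y}_i\right)^2},\qquad
\mathrm{MAE}=\frac{1}{n}\sum_{i=1}^{n}\left|y_i-\hat{y}_i\right|,\qquad
R^2=1-\frac{\sum_{i}\left(y_i-\hat{y}_i\right)^2}{\sum_{i}\left(y_i-\bar{y}\right)^2}
$$

其中 $y_i$ 为真实值、$\hat{y}_i$ 为预测值、$\bar{y}$ 为真实值的均值;后文 RegressionEvaluator 输出的 rmse、mae、r2 即对应这三个指标。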

4.1 线性回归

线性回归是最基础的回归算法,它假设目标变量可以表示为输入特征的线性组合,并通过最小化预测误差来估计模型参数。

4.1.1 线性回归原理
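
在进入代码之前,先用公式做一个简要的(一般性)说明:线性回归假设目标值可由特征的线性组合近似,训练过程等价于最小化带正则项的平方误差:

$$
\hat{y}=w^{\top}x+b,\qquad
\min_{w,\,b}\ \frac{1}{n}\sum_{i=1}^{n}\left(y_i-w^{\top}x_i-b\right)^2+\lambda\,\Omega(w)
$$

其中 $\Omega(w)$ 为正则项(L1、L2 或二者的组合),大致对应下面代码中的 regParam 与 elasticNetParam 两个参数。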

```python
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import pandas as pd

class LinearRegressionDemo:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("LinearRegressionDemo") \
            .config("spark.sql.adaptive.enabled", "true") \
            .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
            .getOrCreate()

        self.spark.sparkContext.setLogLevel("WARN")

def create_sample_data(self):
    """
    创建房价预测示例数据
    """
    print("\n=== 创建房价预测数据集 ===")

    # 设置随机种子
    np.random.seed(42)

    # 生成房屋数据
    n_samples = 1500

    # 房屋基本信息
    house_ids = [f"H{i:06d}" for i in range(1, n_samples + 1)]

    # 面积(平方米)
    areas = np.random.normal(120, 40, n_samples)
    areas = np.clip(areas, 50, 300)

    # 房间数量
    bedrooms = np.random.poisson(3, n_samples)
    bedrooms = np.clip(bedrooms, 1, 6)

    bathrooms = np.random.poisson(2, n_samples)
    bathrooms = np.clip(bathrooms, 1, 4)

    # 楼层
    floors = np.random.randint(1, 21, n_samples)

    # 建造年份
    build_years = np.random.randint(1980, 2024, n_samples)
    house_ages = 2024 - build_years

    # 地理位置评分(1-10)
    location_scores = np.random.normal(6, 2, n_samples)
    location_scores = np.clip(location_scores, 1, 10)

    # 学区评分(1-10)
    school_scores = np.random.normal(7, 1.5, n_samples)
    school_scores = np.clip(school_scores, 1, 10)

    # 交通便利性评分(1-10)
    transport_scores = np.random.normal(6.5, 1.8, n_samples)
    transport_scores = np.clip(transport_scores, 1, 10)

    # 装修等级(1-5)
    decoration_levels = np.random.randint(1, 6, n_samples)

    # 是否有车库
    has_garage = np.random.choice([0, 1], n_samples, p=[0.4, 0.6])

    # 是否有花园
    has_garden = np.random.choice([0, 1], n_samples, p=[0.6, 0.4])

    # 生成房价(基于特征的线性组合加噪声)
    base_price = (
        areas * 8000 +  # 面积影响最大
        bedrooms * 50000 +  # 房间数
        bathrooms * 30000 +  # 卫生间数
        location_scores * 80000 +  # 地理位置
        school_scores * 60000 +  # 学区
        transport_scores * 40000 +  # 交通
        decoration_levels * 100000 +  # 装修
        has_garage * 150000 +  # 车库
        has_garden * 100000 -  # 花园
        house_ages * 5000  # 房龄(负影响)
    )

    # 添加楼层影响(中间楼层更贵)
    floor_bonus = np.where(
        (floors >= 5) & (floors <= 15), 
        50000, 
        np.where(floors > 15, -20000, -30000)
    )
    base_price += floor_bonus

    # 添加随机噪声
    noise = np.random.normal(0, 200000, n_samples)
    prices = base_price + noise
    prices = np.maximum(prices, 300000)  # 最低价格30万

    # 创建DataFrame
    data = [
        (
            house_ids[i], float(areas[i]), int(bedrooms[i]), int(bathrooms[i]),
            int(floors[i]), int(build_years[i]), int(house_ages[i]),
            float(location_scores[i]), float(school_scores[i]), float(transport_scores[i]),
            int(decoration_levels[i]), int(has_garage[i]), int(has_garden[i]), float(prices[i])
        )
        for i in range(n_samples)
    ]

    schema = StructType([
        StructField("house_id", StringType(), True),
        StructField("area", DoubleType(), True),
        StructField("bedrooms", IntegerType(), True),
        StructField("bathrooms", IntegerType(), True),
        StructField("floor", IntegerType(), True),
        StructField("build_year", IntegerType(), True),
        StructField("house_age", IntegerType(), True),
        StructField("location_score", DoubleType(), True),
        StructField("school_score", DoubleType(), True),
        StructField("transport_score", DoubleType(), True),
        StructField("decoration_level", IntegerType(), True),
        StructField("has_garage", IntegerType(), True),
        StructField("has_garden", IntegerType(), True),
        StructField("price", DoubleType(), True)
    ])

    self.df = self.spark.createDataFrame(data, schema)

    print(f"  生成数据集大小: {self.df.count()} 行, {len(self.df.columns)} 列")

    # 显示数据概览
    print("\n  数据概览:")
    self.df.show(10)

    # 显示数据统计
    print("\n  数据统计:")
    self.df.describe().show()

    # 价格分布
    print("\n  价格分布统计:")
    self.df.select(
        min("price").alias("最低价格"),
        max("price").alias("最高价格"),
        avg("price").alias("平均价格"),
        expr("percentile_approx(price, 0.5)").alias("中位数价格")
    ).show()

    return self.df

def train_linear_regression(self, df):
    """
    训练线性回归模型
    """
    print("\n=== 线性回归模型训练 ===")

    # 1. 特征工程
    print("\n1. 特征工程")

    # 组装特征向量
    feature_cols = [
        "area", "bedrooms", "bathrooms", "floor", "house_age",
        "location_score", "school_score", "transport_score",
        "decoration_level", "has_garage", "has_garden"
    ]

    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="raw_features"
    )

    # 特征标准化
    scaler = StandardScaler(
        inputCol="raw_features",
        outputCol="features",
        withStd=True,
        withMean=True
    )
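    # 补充说明:withMean=True 会对每个特征做去均值(中心化);若输入为稀疏向量会被转为稠密向量,
    # 特征维度很高时需留意内存开销,这里只有十余个特征,开销可以忽略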

    # 应用转换
    df_assembled = assembler.transform(df)
    scaler_model = scaler.fit(df_assembled)
    df_features = scaler_model.transform(df_assembled)

    print("  特征工程完成")
    print(f"  特征维度: {len(feature_cols)}")

    # 2. 数据分割
    print("\n2. 数据分割")
    train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)

    print(f"  训练集大小: {train_data.count()}")
    print(f"  测试集大小: {test_data.count()}")

    # 3. 创建线性回归模型
    print("\n3. 创建线性回归模型")

    lr = LinearRegression(
        featuresCol="features",
        labelCol="price",
        regParam=0.1,  # L2正则化参数
        elasticNetParam=0.0,  # 弹性网络参数(0=L2, 1=L1)
        maxIter=100,
        tol=1e-6
    )
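    # 补充说明(按 Spark ML 的正则化约定):整体正则项大致为
    #   regParam * (elasticNetParam * ||w||_1 + (1 - elasticNetParam) / 2 * ||w||_2^2)
    # 因此 elasticNetParam=0 对应岭回归(纯 L2),=1 对应 Lasso(纯 L1)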

    # 4. 训练模型
    print("\n4. 训练模型")
    lr_model = lr.fit(train_data)

    print("  模型训练完成")
    print(f"  迭代次数: {lr_model.summary.totalIterations}")
    print(f"  目标函数值: {lr_model.summary.objectiveHistory[-1]:.6f}")

    # 5. 模型系数
    print("\n5. 模型系数")
    coefficients = lr_model.coefficients.toArray()
    intercept = lr_model.intercept

    print(f"  截距: {intercept:.2f}")
    print("  特征系数:")
    for feature, coef in zip(feature_cols, coefficients):
        print(f"    {feature}: {coef:.2f}")

    # 6. 模型预测
    print("\n6. 模型预测")
    predictions = lr_model.transform(test_data)

    print("  预测结果样例:")
    predictions.select(
        "house_id", "area", "bedrooms", "location_score",
        "price", "prediction"
    ).show(10)

    # 7. 模型评估
    print("\n7. 模型评估")

    evaluator = RegressionEvaluator(
        labelCol="price",
        predictionCol="prediction"
    )

    rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
    mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

    print(f"  RMSE (均方根误差): {rmse:.2f}")
    print(f"  MAE (平均绝对误差): {mae:.2f}")
    print(f"  R² (决定系数): {r2:.4f}")

    # 8. 训练集评估
    train_predictions = lr_model.transform(train_data)
    train_rmse = evaluator.evaluate(train_predictions, {evaluator.metricName: "rmse"})
    train_r2 = evaluator.evaluate(train_predictions, {evaluator.metricName: "r2"})

    print(f"\n  训练集性能:")
    print(f"    RMSE: {train_rmse:.2f}")
    print(f"    R²: {train_r2:.4f}")

    # 9. 残差分析
    print("\n8. 残差分析")
    residuals = predictions.withColumn(
        "residual", 
        col("price") - col("prediction")
    )

    residual_stats = residuals.select(
        avg("residual").alias("平均残差"),
        stddev("residual").alias("残差标准差"),
        min("residual").alias("最小残差"),
        max("residual").alias("最大残差")
    ).collect()[0]

    print(f"  平均残差: {residual_stats['平均残差']:.2f}")
    print(f"  残差标准差: {residual_stats['残差标准差']:.2f}")
    print(f"  残差范围: [{residual_stats['最小残差']:.2f}, {residual_stats['最大残差']:.2f}]")

    return {
        'model': lr_model,
        'predictions': predictions,
        'scaler_model': scaler_model,
        'feature_cols': feature_cols,
        'metrics': {
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'train_rmse': train_rmse,
            'train_r2': train_r2
        },
        'coefficients': dict(zip(feature_cols, coefficients)),
        'intercept': intercept
    }

def hyperparameter_tuning(self, df):
    """
    线性回归超参数调优
    """
    print("\n=== 线性回归超参数调优 ===")

    # 1. 创建Pipeline
    feature_cols = [
        "area", "bedrooms", "bathrooms", "floor", "house_age",
        "location_score", "school_score", "transport_score",
        "decoration_level", "has_garage", "has_garden"
    ]

    assembler = VectorAssembler(inputCols=feature_cols, outputCol="raw_features")
    scaler = StandardScaler(inputCol="raw_features", outputCol="features", withStd=True, withMean=True)
    lr = LinearRegression(featuresCol="features", labelCol="price")

    pipeline = Pipeline(stages=[assembler, scaler, lr])

    # 2. 参数网格
    param_grid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.0, 0.01, 0.1, 1.0]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
        .addGrid(lr.maxIter, [50, 100, 200]) \
        .build()
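    # 补充说明:4 × 3 × 3 = 36 组参数组合;结合 numFolds=5 的交叉验证,
    # 共需训练约 36 × 5 = 180 个模型,调参前应评估可用算力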

    print(f"  参数组合数量: {len(param_grid)}")

    # 3. 交叉验证
    evaluator = RegressionEvaluator(
        labelCol="price",
        predictionCol="prediction",
        metricName="rmse"
    )

    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=5,
        seed=42
    )

    # 4. 训练和选择最佳模型
    print("\n  开始交叉验证...")
    cv_model = cv.fit(df)

    # 5. 最佳参数
    best_pipeline = cv_model.bestModel
    best_lr = best_pipeline.stages[-1]  # 线性回归是Pipeline的最后一个阶段

    print("\n  最佳参数:")
    print(f"    regParam: {best_lr.getRegParam()}")
    print(f"    elasticNetParam: {best_lr.getElasticNetParam()}")
    print(f"    maxIter: {best_lr.getMaxIter()}")

    # 6. 交叉验证结果
    print("\n  交叉验证平均RMSE:")
    avg_metrics = cv_model.avgMetrics
    for i, score in enumerate(avg_metrics):
        print(f"    参数组合 {i+1}: {score:.2f}")

    # 由于上面使用了 from pyspark.sql.functions import *,内置 min 被同名列函数覆盖,
    # 这里改用 np.min 取最小的 RMSE(RMSE 越小越好)
    best_score = float(np.min(avg_metrics))
    print(f"\n  最佳交叉验证RMSE: {best_score:.2f}")

    return {
        'best_model': best_pipeline,
        'cv_model': cv_model,
        'best_score': best_score,
        'avg_metrics': avg_metrics
    }