6.1 数据导入与清洗

6.1.1 多种数据源处理

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import json
import csv
from io import StringIO
import requests
from matplotlib.dates import DateFormatter, MonthLocator
import seaborn as sns
from scipy import stats

class DataProcessingDemo:
    """数据处理与可视化演示类"""
    
    def __init__(self):
        """Set up plotting fonts, seed NumPy's global RNG and build the sample data."""
        # Font fallback list for the Chinese labels used in the figures; the
        # Unicode minus glyph is disabled so negative tick labels use a plain hyphen.
        plt.rcParams.update({
            'font.sans-serif': ['SimHei', 'Arial Unicode MS'],
            'axes.unicode_minus': False,
        })

        # Fixed seed: the generated datasets are identical on every run.
        np.random.seed(42)
        self.generate_sample_data()
    
    def generate_sample_data(self):
        """生成示例数据"""
        # 1. 时间序列数据
        dates = pd.date_range('2023-01-01', periods=365, freq='D')
        self.time_series_data = pd.DataFrame({
            'date': dates,
            'temperature': 20 + 10 * np.sin(2 * np.pi * np.arange(365) / 365) + np.random.normal(0, 2, 365),
            'humidity': 60 + 20 * np.cos(2 * np.pi * np.arange(365) / 365) + np.random.normal(0, 5, 365),
            'pressure': 1013 + 10 * np.sin(4 * np.pi * np.arange(365) / 365) + np.random.normal(0, 3, 365)
        })
        
        # 2. 销售数据
        products = ['产品A', '产品B', '产品C', '产品D', '产品E']
        regions = ['北区', '南区', '东区', '西区']
        
        sales_data = []
        for month in range(1, 13):
            for product in products:
                for region in regions:
                    sales = np.random.normal(1000, 200) * (1 + 0.1 * np.sin(month * np.pi / 6))
                    sales_data.append({
                        'month': month,
                        'product': product,
                        'region': region,
                        'sales': max(0, sales),
                        'profit': sales * np.random.uniform(0.1, 0.3)
                    })
        
        self.sales_data = pd.DataFrame(sales_data)
        
        # 3. 用户行为数据
        user_ids = range(1, 1001)
        self.user_data = pd.DataFrame({
            'user_id': user_ids,
            'age': np.random.normal(35, 12, 1000).astype(int),
            'gender': np.random.choice(['男', '女'], 1000),
            'city_tier': np.random.choice(['一线', '二线', '三线', '四线'], 1000, p=[0.2, 0.3, 0.3, 0.2]),
            'monthly_spend': np.random.lognormal(6, 1, 1000),
            'session_duration': np.random.exponential(15, 1000),
            'page_views': np.random.poisson(8, 1000)
        })
        
        # 添加一些缺失值
        missing_indices = np.random.choice(1000, 50, replace=False)
        self.user_data.loc[missing_indices, 'monthly_spend'] = np.nan
        
        # 4. 股票数据模拟
        self.stock_data = self.generate_stock_data()
    
    def generate_stock_data(self):
        """生成模拟股票数据"""
        dates = pd.date_range('2023-01-01', periods=252, freq='B')  # 工作日
        
        # 模拟股票价格随机游走
        returns = np.random.normal(0.001, 0.02, 252)
        prices = [100]  # 初始价格
        
        for ret in returns:
            prices.append(prices[-1] * (1 + ret))
        
        prices = prices[1:]  # 移除初始价格
        
        # 生成成交量
        volumes = np.random.lognormal(10, 0.5, 252)
        
        return pd.DataFrame({
            'date': dates,
            'price': prices,
            'volume': volumes,
            'returns': returns
        })
    
    def data_cleaning_demo(self):
        """Show a 2x3 figure of data-cleaning checks on ``self.user_data``.

        Panels: missing-value counts per column, IQR-based outlier detection
        on monthly spend, raw vs. log-transformed spend histograms, box plots
        of the numeric columns before and after z-score standardisation, and
        a data-quality score card. The figure is displayed with
        ``plt.show()``; nothing is returned.
        """
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))

        # 1. Missing values per column
        ax1 = axes[0, 0]
        missing_counts = self.user_data.isnull().sum()

        ax1.bar(range(len(missing_counts)), missing_counts.values,
                color='red', alpha=0.7)
        ax1.set_xticks(range(len(missing_counts)))
        ax1.set_xticklabels(missing_counts.index, rotation=45)
        ax1.set_title('缺失值统计')
        ax1.set_ylabel('缺失值数量')
        ax1.grid(True, alpha=0.3)

        # 2. Outlier detection with a box plot
        ax2 = axes[0, 1]
        spend_data = self.user_data['monthly_spend'].dropna()
        box_plot = ax2.boxplot(spend_data, patch_artist=True)
        box_plot['boxes'][0].set_facecolor('lightblue')

        # Count points outside the 1.5 * IQR fences around the quartiles.
        q1 = spend_data.quantile(0.25)
        q3 = spend_data.quantile(0.75)
        iqr = q3 - q1
        outliers = spend_data[(spend_data < q1 - 1.5 * iqr) | (spend_data > q3 + 1.5 * iqr)]

        ax2.set_title(f'月消费异常值检测\n异常值数量: {len(outliers)}')
        ax2.set_ylabel('月消费金额')
        ax2.grid(True, alpha=0.3)

        # 3. Raw vs. log-transformed distribution
        ax3 = axes[0, 2]
        ax3.hist(spend_data, bins=50, alpha=0.7, color='blue', label='原始数据')

        # The log values are on a completely different scale from the raw
        # amounts, so they need their own x-axis (top) as well as their own
        # y-axis (right). A plain twinx() shares the raw x-axis and would
        # squash the log histogram into a sliver near zero.
        log_spend = np.log(spend_data)
        ax3_twin = ax3.twinx()
        ax3_log = ax3_twin.twiny()
        ax3_log.hist(log_spend, bins=50, alpha=0.5, color='red', label='对数变换')

        ax3.set_title('数据分布对比')
        ax3.set_xlabel('月消费金额')
        ax3.set_ylabel('频次 (原始)', color='blue')
        ax3_twin.set_ylabel('频次 (对数)', color='red')
        ax3_log.set_xlabel('log(月消费金额)', color='red')
        ax3.legend(loc='upper right')
        ax3_log.legend(loc='upper left')

        # 4. Numeric columns before standardisation
        ax4 = axes[1, 0]
        numeric_cols = ['age', 'monthly_spend', 'session_duration', 'page_views']
        clean_data = self.user_data[numeric_cols].dropna()
        # Box plots are drawn at positions 1..n; label them explicitly rather
        # than through boxplot's `labels` keyword, which newer Matplotlib
        # releases deprecate.
        box_positions = list(range(1, len(numeric_cols) + 1))

        ax4.boxplot([clean_data[col] for col in numeric_cols], patch_artist=True)
        ax4.set_xticks(box_positions)
        ax4.set_xticklabels(numeric_cols)
        ax4.set_title('标准化前的数据分布')
        ax4.set_ylabel('原始值')
        ax4.tick_params(axis='x', rotation=45)
        ax4.grid(True, alpha=0.3)

        # 5. Numeric columns after z-score standardisation
        ax5 = axes[1, 1]
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(clean_data)

        # boxplot draws one box per *column* of a 2-D array, so the
        # (n_samples, n_features) array is passed as-is; transposing it would
        # produce one box per sample.
        ax5.boxplot(scaled_data, patch_artist=True)
        ax5.set_xticks(box_positions)
        ax5.set_xticklabels(numeric_cols)
        ax5.set_title('Z-score标准化后的数据分布')
        ax5.set_ylabel('标准化值')
        ax5.tick_params(axis='x', rotation=45)
        ax5.grid(True, alpha=0.3)

        # 6. Data-quality score card
        ax6 = axes[1, 2]
        quality_metrics = {
            # Share of non-null cells across the whole frame.
            '完整性': (1 - self.user_data.isnull().sum().sum() / self.user_data.size) * 100,
            # Share of distinct user ids.
            '唯一性': (self.user_data['user_id'].nunique() / len(self.user_data)) * 100,
            '一致性': 95,  # placeholder value
            '准确性': 92,  # placeholder value
            '及时性': 88   # placeholder value
        }

        metrics = list(quality_metrics.keys())
        values = list(quality_metrics.values())
        # Traffic-light colouring: >= 90 green, >= 80 orange, otherwise red.
        colors = ['green' if v >= 90 else 'orange' if v >= 80 else 'red' for v in values]

        bars = ax6.barh(metrics, values, color=colors, alpha=0.7)
        ax6.set_title('数据质量评估')
        ax6.set_xlabel('质量分数 (%)')
        ax6.set_xlim(0, 100)

        # Print each score just past the end of its bar.
        for bar, value in zip(bars, values):
            ax6.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2,
                     f'{value:.1f}%', va='center')

        ax6.grid(True, alpha=0.3, axis='x')

        plt.tight_layout()
        plt.show()
    
    def time_series_analysis_demo(self):
        """Show a 3x2 figure of time-series analyses on ``self.time_series_data``.

        Panels: raw temperature/humidity series, 7- and 30-day moving
        averages, a simple trend/seasonal decomposition, the autocorrelation
        function, the FFT power spectrum, and z-score anomaly detection (all
        but the first panel use the temperature column). The figure is
        displayed with ``plt.show()``; nothing is returned.
        """
        def _format_month_axis(ax):
            """Use year-month tick labels every second month, rotated 45 degrees."""
            ax.xaxis.set_major_formatter(DateFormatter('%Y-%m'))
            ax.xaxis.set_major_locator(MonthLocator(interval=2))
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

        fig, axes = plt.subplots(3, 2, figsize=(16, 14))
        dates = self.time_series_data['date']
        temp_data = self.time_series_data['temperature']

        # 1. Raw series
        ax1 = axes[0, 0]
        ax1.plot(dates, temp_data,
                 'b-', linewidth=1, alpha=0.7, label='温度')
        ax1.plot(dates, self.time_series_data['humidity'],
                 'r-', linewidth=1, alpha=0.7, label='湿度')

        ax1.set_title('原始时间序列数据')
        ax1.set_xlabel('日期')
        ax1.set_ylabel('数值')
        ax1.legend()
        ax1.grid(True, alpha=0.3)
        _format_month_axis(ax1)

        # 2. Moving averages with two window lengths
        ax2 = axes[0, 1]
        ma_7 = temp_data.rolling(window=7).mean()
        ma_30 = temp_data.rolling(window=30).mean()

        ax2.plot(dates, temp_data,
                 'lightgray', linewidth=0.5, alpha=0.5, label='原始数据')
        ax2.plot(dates, ma_7,
                 'blue', linewidth=2, label='7日移动平均')
        ax2.plot(dates, ma_30,
                 'red', linewidth=2, label='30日移动平均')

        ax2.set_title('移动平均平滑')
        ax2.set_xlabel('日期')
        ax2.set_ylabel('温度')
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        _format_month_axis(ax2)

        # 3. Simple decomposition
        ax3 = axes[1, 0]

        # Trend: long centred moving average (NaN near both ends of the series).
        trend = temp_data.rolling(window=90, center=True).mean()
        detrended = temp_data - trend

        # Seasonal profile: mean of the detrended values at each position in
        # the cycle. The period must be shorter than the series so that every
        # position is averaged over several cycles; grouping 365 rows by
        # `index % 365` would put each row in its own group and simply return
        # the detrended series.
        seasonal_period = 7
        seasonal = detrended.groupby(detrended.index % seasonal_period).transform('mean')

        ax3.plot(dates, trend, 'r-', linewidth=2, label='趋势')
        # Shift the seasonal profile to the trend's level so both fit one axis.
        ax3.plot(dates, seasonal + trend.mean(),
                 'g-', linewidth=1, alpha=0.7, label='季节性')

        ax3.set_title('时间序列分解')
        ax3.set_xlabel('日期')
        ax3.set_ylabel('温度')
        ax3.legend()
        ax3.grid(True, alpha=0.3)
        _format_month_axis(ax3)

        # 4. Autocorrelation
        ax4 = axes[1, 1]
        from statsmodels.tsa.stattools import acf

        acf_input = temp_data.dropna()
        autocorr = acf(acf_input, nlags=50)
        lags = range(len(autocorr))
        # Approximate 95% band for the sample autocorrelation of white noise:
        # +/- 1.96 / sqrt(N).
        conf_bound = 1.96 / np.sqrt(len(acf_input))

        ax4.stem(lags, autocorr, basefmt=" ")
        ax4.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        ax4.axhline(y=conf_bound, color='red', linestyle='--', alpha=0.5, label='显著性阈值')
        ax4.axhline(y=-conf_bound, color='red', linestyle='--', alpha=0.5)

        ax4.set_title('自相关函数')
        ax4.set_xlabel('滞后期')
        ax4.set_ylabel('自相关系数')
        ax4.legend()
        ax4.grid(True, alpha=0.3)

        # 5. Power spectrum via FFT
        ax5 = axes[2, 0]
        from scipy.fft import fft, fftfreq

        temp_clean = temp_data.dropna().to_numpy()
        fft_values = fft(temp_clean)
        frequencies = fftfreq(len(temp_clean), d=1)  # one sample per day

        # Keep the positive-frequency half only.
        positive_freq_idx = frequencies > 0

        ax5.loglog(frequencies[positive_freq_idx],
                   np.abs(fft_values[positive_freq_idx])**2)
        ax5.set_title('功率谱密度')
        ax5.set_xlabel('频率 (1/天)')
        ax5.set_ylabel('功率')
        ax5.grid(True, alpha=0.3)

        # Mark the one-cycle-per-year frequency.
        annual_freq = 1/365
        ax5.axvline(x=annual_freq, color='red', linestyle='--',
                    label=f'年周期 ({annual_freq:.4f})')
        ax5.legend()

        # 6. Anomaly detection
        ax6 = axes[2, 1]

        # Population z-score computed on the Series itself, so the boolean
        # mask keeps the original index and stays aligned with `dates` even
        # if the column contains NaN (NaN compares False and is never flagged).
        z_scores = ((temp_data - temp_data.mean()) / temp_data.std(ddof=0)).abs()
        threshold = 3
        anomalies = z_scores > threshold

        ax6.plot(dates, temp_data,
                 'b-', linewidth=1, alpha=0.7, label='正常数据')

        anomaly_dates = dates[anomalies]
        anomaly_temps = temp_data[anomalies]

        ax6.scatter(anomaly_dates, anomaly_temps,
                    color='red', s=50, zorder=5, label=f'异常值 ({len(anomaly_temps)}个)')

        ax6.set_title('异常值检测 (Z-score > 3)')
        ax6.set_xlabel('日期')
        ax6.set_ylabel('温度')
        ax6.legend()
        ax6.grid(True, alpha=0.3)
        _format_month_axis(ax6)

        plt.tight_layout()
        plt.show()
    
    def statistical_analysis_demo(self):
        """Show a 3x3 figure of statistical analyses on ``self.user_data``.

        Panels: spend histogram with mean/median, correlation heat map,
        age-vs-spend linear regression, t-test by gender, one-way ANOVA by
        city tier, exponential/gamma fits of session duration, K-means
        clustering, PCA explained variance, and a simulated retention curve.
        The figure is displayed with ``plt.show()``; nothing is returned.
        """
        fig, axes = plt.subplots(3, 3, figsize=(18, 15))

        # 1. Descriptive statistics
        ax1 = axes[0, 0]
        spend_data = self.user_data['monthly_spend'].dropna()
        stats_summary = {
            '均值': spend_data.mean(),
            '中位数': spend_data.median(),
            '标准差': spend_data.std(),
            '偏度': stats.skew(spend_data),
            '峰度': stats.kurtosis(spend_data)
        }

        ax1.hist(spend_data, bins=50, alpha=0.7, color='skyblue', density=True)

        # Mark mean and median on the histogram.
        ax1.axvline(stats_summary['均值'], color='red', linestyle='--',
                    linewidth=2, label=f"均值: {stats_summary['均值']:.0f}")
        ax1.axvline(stats_summary['中位数'], color='green', linestyle='--',
                    linewidth=2, label=f"中位数: {stats_summary['中位数']:.0f}")

        ax1.set_title('月消费分布与描述性统计')
        ax1.set_xlabel('月消费金额')
        ax1.set_ylabel('密度')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # 2. Correlation analysis
        ax2 = axes[0, 1]
        numeric_data = self.user_data[['age', 'monthly_spend', 'session_duration', 'page_views']].dropna()
        correlation_matrix = numeric_data.corr()
        n_vars = len(correlation_matrix)

        im = ax2.imshow(correlation_matrix, cmap='coolwarm', vmin=-1, vmax=1)

        # Write each coefficient into its cell.
        for i in range(n_vars):
            for j in range(n_vars):
                ax2.text(j, i, f'{correlation_matrix.iloc[i, j]:.2f}',
                         ha="center", va="center", color="black", fontweight='bold')

        ax2.set_xticks(range(n_vars))
        ax2.set_yticks(range(n_vars))
        ax2.set_xticklabels(correlation_matrix.columns, rotation=45)
        ax2.set_yticklabels(correlation_matrix.columns)
        ax2.set_title('变量相关性矩阵')

        plt.colorbar(im, ax=ax2, shrink=0.8)

        # 3. Regression: age vs. monthly spend
        ax3 = axes[0, 2]
        age_data = self.user_data['age']
        spend_clean = self.user_data['monthly_spend'].dropna()
        # Keep only the ages of rows that still have a spend value.
        age_clean = age_data[spend_clean.index]

        ax3.scatter(age_clean, spend_clean, alpha=0.5, s=20)

        slope, intercept, r_value, p_value, std_err = stats.linregress(age_clean, spend_clean)
        line_x = np.array([age_clean.min(), age_clean.max()])
        line_y = slope * line_x + intercept

        ax3.plot(line_x, line_y, 'r-', linewidth=2,
                 label=f'R² = {r_value**2:.3f}\np = {p_value:.3f}')

        ax3.set_title('年龄与月消费回归分析')
        ax3.set_xlabel('年龄')
        ax3.set_ylabel('月消费金额')
        ax3.legend()
        ax3.grid(True, alpha=0.3)

        # 4. Hypothesis test: spend by gender (independent two-sample t-test)
        ax4 = axes[1, 0]
        male_spend = self.user_data[self.user_data['gender'] == '男']['monthly_spend'].dropna()
        female_spend = self.user_data[self.user_data['gender'] == '女']['monthly_spend'].dropna()

        t_stat, t_p_value = stats.ttest_ind(male_spend, female_spend)

        # Boxes are drawn at positions 1 and 2; label them explicitly rather
        # than through boxplot's `labels` keyword, which newer Matplotlib
        # releases deprecate.
        box_plot = ax4.boxplot([male_spend, female_spend], patch_artist=True)
        ax4.set_xticks([1, 2])
        ax4.set_xticklabels(['男性', '女性'])
        box_plot['boxes'][0].set_facecolor('lightblue')
        box_plot['boxes'][1].set_facecolor('lightpink')

        ax4.set_title(f'性别消费差异检验\nt统计量: {t_stat:.3f}, p值: {t_p_value:.3f}')
        ax4.set_ylabel('月消费金额')
        ax4.grid(True, alpha=0.3)

        # 5. One-way ANOVA: spend by city tier
        ax5 = axes[1, 1]
        tier_groups = []
        tier_labels = []

        for tier in ['一线', '二线', '三线', '四线']:
            tier_data = self.user_data[self.user_data['city_tier'] == tier]['monthly_spend'].dropna()
            # Skip tiers without any observations.
            if len(tier_data) > 0:
                tier_groups.append(tier_data)
                tier_labels.append(tier)

        f_stat, anova_p_value = stats.f_oneway(*tier_groups)

        parts = ax5.violinplot(tier_groups, positions=range(len(tier_groups)),
                               showmeans=True, showmedians=True)

        for pc in parts['bodies']:
            pc.set_facecolor('lightgreen')
            pc.set_alpha(0.7)

        ax5.set_xticks(range(len(tier_labels)))
        ax5.set_xticklabels(tier_labels)
        ax5.set_title(f'城市等级消费差异 (ANOVA)\nF统计量: {f_stat:.3f}, p值: {anova_p_value:.3f}')
        ax5.set_ylabel('月消费金额')
        ax5.grid(True, alpha=0.3)

        # 6. Distribution fitting on session duration
        ax6 = axes[1, 2]
        session_data = self.user_data['session_duration'].dropna()

        ax6.hist(session_data, bins=50, alpha=0.7, density=True,
                 color='lightblue', label='观测数据')

        # Exponential fit
        exp_params = stats.expon.fit(session_data)
        x_exp = np.linspace(0, session_data.max(), 100)
        ax6.plot(x_exp, stats.expon.pdf(x_exp, *exp_params),
                 'r-', linewidth=2, label='指数分布拟合')

        # Gamma fit, evaluated on the same grid
        gamma_params = stats.gamma.fit(session_data)
        ax6.plot(x_exp, stats.gamma.pdf(x_exp, *gamma_params),
                 'g-', linewidth=2, label='伽马分布拟合')

        ax6.set_title('会话时长分布拟合')
        ax6.set_xlabel('会话时长 (分钟)')
        ax6.set_ylabel('密度')
        ax6.legend()
        ax6.grid(True, alpha=0.3)

        # 7. Clustering
        ax7 = axes[2, 0]

        from sklearn.cluster import KMeans
        from sklearn.preprocessing import StandardScaler

        cluster_data = self.user_data[['age', 'monthly_spend']].dropna()
        scaler = StandardScaler()
        scaled_cluster_data = scaler.fit_transform(cluster_data)

        kmeans = KMeans(n_clusters=3, random_state=42)
        clusters = kmeans.fit_predict(scaled_cluster_data)

        # Plot each cluster in the original (unscaled) units.
        colors = ['red', 'blue', 'green']
        for i in range(3):
            mask = clusters == i
            ax7.scatter(cluster_data.iloc[mask, 0], cluster_data.iloc[mask, 1],
                        c=colors[i], alpha=0.6, s=20, label=f'聚类 {i+1}')

        # Centres are found in scaled space; map them back before plotting.
        centers = scaler.inverse_transform(kmeans.cluster_centers_)
        ax7.scatter(centers[:, 0], centers[:, 1],
                    c='black', marker='x', s=200, linewidths=3, label='聚类中心')

        ax7.set_title('用户聚类分析 (年龄 vs 消费)')
        ax7.set_xlabel('年龄')
        ax7.set_ylabel('月消费金额')
        ax7.legend()
        ax7.grid(True, alpha=0.3)

        # 8. Principal component analysis
        ax8 = axes[2, 1]

        from sklearn.decomposition import PCA

        pca_data = self.user_data[['age', 'monthly_spend', 'session_duration', 'page_views']].dropna()
        scaler_pca = StandardScaler()
        scaled_pca_data = scaler_pca.fit_transform(pca_data)

        # Only the fitted variance ratios are needed, not the projected data.
        pca = PCA()
        pca.fit(scaled_pca_data)

        explained_variance = pca.explained_variance_ratio_
        cumulative_variance = np.cumsum(explained_variance)

        ax8.bar(range(1, len(explained_variance) + 1), explained_variance,
                alpha=0.7, color='skyblue', label='单独解释方差')
        ax8.plot(range(1, len(cumulative_variance) + 1), cumulative_variance,
                 'ro-', linewidth=2, label='累积解释方差')

        ax8.set_title('主成分分析 - 解释方差')
        ax8.set_xlabel('主成分')
        ax8.set_ylabel('解释方差比例')
        ax8.legend()
        ax8.grid(True, alpha=0.3)

        # 9. Retention curve (simulated)
        ax9 = axes[2, 2]

        days = np.arange(1, 31)
        retention_rate = np.exp(-days / 15)  # exponential decay
        retention_rate += np.random.normal(0, 0.02, len(days))  # add noise
        retention_rate = np.clip(retention_rate, 0, 1)

        ax9.plot(days, retention_rate, 'b-', linewidth=2, marker='o', markersize=4)
        ax9.fill_between(days, retention_rate, alpha=0.3)

        # Annotate the retention value at selected days.
        key_days = [1, 7, 14, 30]
        for day in key_days:
            if day <= len(days):
                idx = day - 1
                ax9.axvline(x=day, color='red', linestyle='--', alpha=0.5)
                ax9.text(day, retention_rate[idx] + 0.05, f'{retention_rate[idx]:.1%}',
                         ha='center', fontsize=9)

        ax9.set_title('用户留存率分析')
        ax9.set_xlabel('天数')
        ax9.set_ylabel('留存率')
        ax9.set_ylim(0, 1)
        ax9.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()
    
    def business_intelligence_demo(self):
        """Show a dashboard-style figure built on a 4x4 grid.

        Panels: monthly sales/profit trend, product share pie, regional sales
        bars, product-by-region heat map, a 3-D RFM scatter of simulated
        customers, stock price with moving averages and Bollinger bands, KPI
        completion bars, and a linear sales forecast. Re-seeds NumPy's global
        RNG with 42 before simulating the RFM data. The figure is displayed
        with ``plt.show()``; nothing is returned.
        """
        fig = plt.figure(figsize=(20, 14))
        gs = fig.add_gridspec(4, 4, hspace=0.3, wspace=0.3)

        # 1. Monthly sales trend (sales on the left axis, profit on the right)
        ax1 = fig.add_subplot(gs[0, :2])

        monthly_sales = self.sales_data.groupby('month')['sales'].sum()
        monthly_profit = self.sales_data.groupby('month')['profit'].sum()

        ax1_twin = ax1.twinx()

        line1 = ax1.plot(monthly_sales.index, monthly_sales.values,
                         'b-', linewidth=3, marker='o', markersize=6, label='销售额')
        line2 = ax1_twin.plot(monthly_profit.index, monthly_profit.values,
                              'r-', linewidth=3, marker='s', markersize=6, label='利润')

        ax1.set_title('月度销售趋势分析', fontsize=14, fontweight='bold')
        ax1.set_xlabel('月份')
        ax1.set_ylabel('销售额', color='blue')
        ax1_twin.set_ylabel('利润', color='red')
        ax1.tick_params(axis='y', labelcolor='blue')
        ax1_twin.tick_params(axis='y', labelcolor='red')

        # One legend covering the lines from both axes.
        lines = line1 + line2
        labels = [l.get_label() for l in lines]
        ax1.legend(lines, labels, loc='upper left')

        ax1.grid(True, alpha=0.3)

        # 2. Product share of sales
        ax2 = fig.add_subplot(gs[0, 2])

        product_sales = self.sales_data.groupby('product')['sales'].sum()
        colors = plt.cm.Set3(np.linspace(0, 1, len(product_sales)))

        ax2.pie(product_sales.values, labels=product_sales.index,
                autopct='%1.1f%%', colors=colors, startangle=90)
        ax2.set_title('产品销售占比', fontsize=12, fontweight='bold')

        # 3. Sales by region, best region highlighted
        ax3 = fig.add_subplot(gs[0, 3])

        region_sales = self.sales_data.groupby('region')['sales'].sum().sort_values(ascending=True)
        top_region_sales = region_sales.max()
        colors_region = ['red' if x == top_region_sales else 'lightblue' for x in region_sales.values]

        bars = ax3.barh(region_sales.index, region_sales.values, color=colors_region, alpha=0.8)
        ax3.set_title('地区销售对比', fontsize=12, fontweight='bold')
        ax3.set_xlabel('销售额')

        # Print each total just past the end of its bar.
        for bar, value in zip(bars, region_sales.values):
            ax3.text(bar.get_width() + value*0.01, bar.get_y() + bar.get_height()/2,
                     f'{value:.0f}', va='center', fontsize=10)

        ax3.grid(True, alpha=0.3, axis='x')

        # 4. Product x region heat map
        ax4 = fig.add_subplot(gs[1, :2])

        pivot_data = self.sales_data.pivot_table(values='sales', index='product',
                                                 columns='region', aggfunc='sum')

        im = ax4.imshow(pivot_data.values, cmap='YlOrRd', aspect='auto')

        ax4.set_xticks(range(len(pivot_data.columns)))
        ax4.set_yticks(range(len(pivot_data.index)))
        ax4.set_xticklabels(pivot_data.columns)
        ax4.set_yticklabels(pivot_data.index)

        # Write each total into its cell.
        for i in range(len(pivot_data.index)):
            for j in range(len(pivot_data.columns)):
                ax4.text(j, i, f'{pivot_data.iloc[i, j]:.0f}',
                         ha="center", va="center", color="black", fontweight='bold')

        ax4.set_title('产品-地区销售热力图', fontsize=12, fontweight='bold')
        plt.colorbar(im, ax=ax4, shrink=0.8)

        # 5. Customer value analysis (RFM) on simulated customers
        np.random.seed(42)
        n_customers = 200

        # Recency: days since the last purchase
        recency = np.random.exponential(30, n_customers)
        # Frequency: number of purchases
        frequency = np.random.poisson(5, n_customers) + 1
        # Monetary: amount spent
        monetary = np.random.lognormal(6, 1, n_customers)

        # Simplified quintile scores.
        r_score = pd.qcut(recency, 5, labels=[5, 4, 3, 2, 1])  # more recent is better
        # Purchase counts are small integers with many ties, which can make
        # the quantile edges coincide and qcut raise; ranking first gives
        # every customer a distinct value so the five bins are always valid.
        frequency_rank = pd.Series(frequency).rank(method='first')
        f_score = pd.qcut(frequency_rank, 5, labels=[1, 2, 3, 4, 5])  # more is better
        m_score = pd.qcut(monetary, 5, labels=[1, 2, 3, 4, 5])  # higher is better

        total_score = (np.asarray(r_score.astype(int))
                       + f_score.astype(int).to_numpy()
                       + np.asarray(m_score.astype(int)))

        # Create the 3-D axes once; adding a 2-D axes to the same grid slot
        # first would leave an empty frame drawn behind the scatter.
        ax5 = fig.add_subplot(gs[1, 2:], projection='3d')

        scatter = ax5.scatter(recency, frequency, monetary,
                              c=total_score, cmap='viridis', s=50, alpha=0.7)

        ax5.set_xlabel('Recency (天)')
        ax5.set_ylabel('Frequency (次)')
        ax5.set_zlabel('Monetary (元)')
        ax5.set_title('RFM客户价值分析', fontsize=12, fontweight='bold')

        plt.colorbar(scatter, ax=ax5, shrink=0.5, label='RFM总分')

        # 6. Stock technical analysis
        ax6 = fig.add_subplot(gs[2, :])

        stock_data = self.stock_data.copy()

        # Moving averages
        stock_data['MA5'] = stock_data['price'].rolling(window=5).mean()
        stock_data['MA20'] = stock_data['price'].rolling(window=20).mean()

        # Bollinger bands: 20-day mean +/- two rolling standard deviations
        rolling_std = stock_data['price'].rolling(window=20).std()
        stock_data['BB_upper'] = stock_data['MA20'] + 2 * rolling_std
        stock_data['BB_lower'] = stock_data['MA20'] - 2 * rolling_std

        ax6.plot(stock_data['date'], stock_data['price'], 'k-', linewidth=1, label='股价')
        ax6.plot(stock_data['date'], stock_data['MA5'], 'b-', linewidth=1, label='MA5')
        ax6.plot(stock_data['date'], stock_data['MA20'], 'r-', linewidth=1, label='MA20')

        ax6.fill_between(stock_data['date'], stock_data['BB_upper'], stock_data['BB_lower'],
                         alpha=0.2, color='gray', label='布林带')

        ax6.set_title('股票技术分析', fontsize=14, fontweight='bold')
        ax6.set_xlabel('日期')
        ax6.set_ylabel('价格')
        ax6.legend()
        ax6.grid(True, alpha=0.3)

        # Year-month tick labels, one per month
        ax6.xaxis.set_major_formatter(DateFormatter('%Y-%m'))
        ax6.xaxis.set_major_locator(MonthLocator(interval=1))
        plt.setp(ax6.xaxis.get_majorticklabels(), rotation=45)

        # 7. KPI dashboard
        ax7 = fig.add_subplot(gs[3, :2])

        kpis = {
            '总销售额': self.sales_data['sales'].sum(),
            '总利润': self.sales_data['profit'].sum(),
            '平均客单价': self.user_data['monthly_spend'].mean(),
            '活跃用户数': len(self.user_data),
            '用户留存率': 0.75  # placeholder value
        }

        targets = {
            '总销售额': 250000,
            '总利润': 50000,
            '平均客单价': 800,
            '活跃用户数': 1200,
            '用户留存率': 0.80
        }

        kpi_names = []
        actual_rates = []
        colors_kpi = []

        for kpi, value in kpis.items():
            rate = value / targets[kpi]
            kpi_names.append(kpi)
            actual_rates.append(rate)

            # Traffic-light colouring by completion rate.
            if rate >= 1.0:
                colors_kpi.append('green')
            elif rate >= 0.8:
                colors_kpi.append('orange')
            else:
                colors_kpi.append('red')

        # Bar length is capped at 120% so it stays inside the axis limits;
        # the text label still reports the uncapped completion rate.
        completion_rates = [min(rate, 1.2) for rate in actual_rates]

        bars = ax7.barh(kpi_names, completion_rates, color=colors_kpi, alpha=0.7)

        ax7.axvline(x=1.0, color='black', linestyle='--', linewidth=2, label='目标线')

        for bar, rate in zip(bars, actual_rates):
            ax7.text(bar.get_width() + 0.02, bar.get_y() + bar.get_height()/2,
                     f'{rate:.1%}', va='center', fontweight='bold')

        ax7.set_title('关键指标完成情况', fontsize=12, fontweight='bold')
        ax7.set_xlabel('目标完成率')
        ax7.set_xlim(0, 1.3)
        ax7.legend()
        ax7.grid(True, alpha=0.3, axis='x')

        # 8. Forecast: straight-line trend fitted to the monthly totals
        ax8 = fig.add_subplot(gs[3, 2:])

        from sklearn.linear_model import LinearRegression

        X = monthly_sales.index.values.reshape(-1, 1)
        y = monthly_sales.values

        model = LinearRegression()
        model.fit(X, y)

        # Extrapolate three months past the data (months 13-15).
        future_months = np.arange(13, 16).reshape(-1, 1)
        predictions = model.predict(future_months)

        ax8.plot(monthly_sales.index, monthly_sales.values,
                 'bo-', linewidth=2, markersize=6, label='历史数据')
        ax8.plot(future_months.flatten(), predictions,
                 'ro--', linewidth=2, markersize=6, label='预测数据')

        # Simplified band: +/- 1.96 times the standard deviation of the
        # in-sample residuals, constant across the forecast horizon.
        prediction_std = np.std(y - model.predict(X))
        ax8.fill_between(future_months.flatten(),
                         predictions - 1.96*prediction_std,
                         predictions + 1.96*prediction_std,
                         alpha=0.3, color='red', label='95%置信区间')

        ax8.set_title('销售预测分析', fontsize=12, fontweight='bold')
        ax8.set_xlabel('月份')
        ax8.set_ylabel('销售额')
        ax8.legend()
        ax8.grid(True, alpha=0.3)

        # One tick per historical and forecast month.
        all_months = list(monthly_sales.index) + list(future_months.flatten())
        ax8.set_xticks(all_months)

        plt.show()

# Usage example: constructing the demo seeds NumPy's RNG and generates the
# sample data; each call below then builds one figure and shows it with
# plt.show().
data_demo = DataProcessingDemo()
data_demo.data_cleaning_demo()
data_demo.time_series_analysis_demo()
data_demo.statistical_analysis_demo()
data_demo.business_intelligence_demo()

6.2 Pandas集成

6.2.1 DataFrame可视化

class PandasIntegrationDemo:
    """Pandas集成演示类"""
    
    def __init__(self):
        """Configure matplotlib fonts and build the sample DataFrames."""
        plt.rcParams.update({
            # Sans-serif candidates tried in order so the Chinese labels render.
            'font.sans-serif': ['SimHei', 'Arial Unicode MS'],
            # Use an ASCII hyphen for negative tick labels instead of U+2212.
            'axes.unicode_minus': False,
        })

        # Populate self.sales_df, self.employee_df and self.stock_df.
        self.create_sample_dataframes()
    
    def create_sample_dataframes(self):
        """创建示例DataFrame"""
        # 1. 销售数据
        np.random.seed(42)
        dates = pd.date_range('2023-01-01', periods=365, freq='D')
        
        self.sales_df = pd.DataFrame({
            'date': dates,
            'product_A': np.random.normal(100, 20, 365).cumsum(),
            'product_B': np.random.normal(80, 15, 365).cumsum(),
            'product_C': np.random.normal(120, 25, 365).cumsum(),
            'region': np.random.choice(['北区', '南区', '东区', '西区'], 365),
            'sales_amount': np.random.lognormal(6, 0.5, 365)
        })
        
        # 2. 员工数据
        departments = ['技术部', '销售部', '市场部', '人事部', '财务部']
        self.employee_df = pd.DataFrame({
            'employee_id': range(1, 201),
            'department': np.random.choice(departments, 200),
            'salary': np.random.normal(8000, 2000, 200),
            'experience': np.random.randint(0, 15, 200),
            'performance': np.random.normal(85, 10, 200),
            'age': np.random.normal(32, 8, 200)
        })
        
        # 3. 股票数据
        self.stock_df = pd.DataFrame({
            'date': pd.date_range('2023-01-01', periods=252, freq='B'),
            'open': np.random.normal(100, 5, 252),
            'high': np.random.normal(105, 5, 252),
            'low': np.random.normal(95, 5, 252),
            'close': np.random.normal(100, 5, 252),
            'volume': np.random.lognormal(10, 0.5, 252)
        })
        
        # 确保high >= low, open/close在high/low之间
        for i in range(252):
            high = max(self.stock_df.loc[i, ['open', 'high', 'low', 'close']])
            low = min(self.stock_df.loc[i, ['open', 'high', 'low', 'close']])
            self.stock_df.loc[i, 'high'] = high
            self.stock_df.loc[i, 'low'] = low
    
    def basic_pandas_plotting_demo(self):
        """Draw six basic chart types through the pandas plotting API.

        Lays out a 2x3 figure from ``self.sales_df`` and ``self.employee_df``:
        line, bar, scatter, histogram, grouped box plot and stacked area
        chart, then shows it with ``plt.show()``.

        Returns:
            None. The figure is displayed, not returned.
        """
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        product_cols = ['product_A', 'product_B', 'product_C']
        sales_by_date = self.sales_df.set_index('date')

        # 1. Line chart
        ax1 = axes[0, 0]
        sales_by_date[product_cols].plot(ax=ax1)
        ax1.set_title('产品销售趋势 (DataFrame.plot())')
        ax1.set_ylabel('累计销售量')
        ax1.grid(True, alpha=0.3)

        # 2. Bar chart
        ax2 = axes[0, 1]
        dept_salary = self.employee_df.groupby('department')['salary'].mean()
        dept_salary.plot(kind='bar', ax=ax2, color='skyblue', alpha=0.8)
        ax2.set_title('各部门平均薪资')
        ax2.set_ylabel('平均薪资')
        ax2.tick_params(axis='x', rotation=45)
        ax2.grid(True, alpha=0.3)

        # 3. Scatter plot, coloured by performance
        ax3 = axes[0, 2]
        self.employee_df.plot.scatter(x='experience', y='salary',
                                      c='performance', cmap='viridis',
                                      ax=ax3, alpha=0.7)
        ax3.set_title('经验 vs 薪资 (按绩效着色)')
        ax3.grid(True, alpha=0.3)

        # 4. Histogram
        ax4 = axes[1, 0]
        self.employee_df['age'].plot.hist(bins=20, ax=ax4, alpha=0.7, color='orange')
        ax4.set_title('员工年龄分布')
        ax4.set_xlabel('年龄')
        ax4.set_ylabel('频次')
        ax4.grid(True, alpha=0.3)

        # 5. Box plot
        ax5 = axes[1, 1]
        self.employee_df.boxplot(column='salary', by='department', ax=ax5)
        # With ``by=...`` pandas also writes a "Boxplot grouped by ..." suptitle
        # onto the whole figure; clear it so it does not sit above the grid.
        fig.suptitle('')
        ax5.set_title('各部门薪资分布')
        ax5.set_xlabel('部门')
        ax5.set_ylabel('薪资')
        plt.setp(ax5.xaxis.get_majorticklabels(), rotation=45)

        # 6. Stacked area chart
        ax6 = axes[1, 2]
        # NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of
        # 'ME'; 'M' is kept here so older pandas versions keep working.
        monthly_sales = sales_by_date.resample('M')[product_cols].sum()
        monthly_sales.plot.area(ax=ax6, alpha=0.7)
        ax6.set_title('月度产品销售堆叠面积图')
        ax6.set_ylabel('销售量')
        ax6.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()
    
    def advanced_pandas_plotting_demo(self):
        """Show six more advanced pandas/matplotlib plotting techniques.

        Builds a 2x3 figure from ``self.sales_df`` and ``self.employee_df``:
        per-product sub-panels, a twin-axis chart, a grouped scatter, a
        pivot-table heatmap, a resampling comparison and rolling statistics,
        then displays it with ``plt.show()``.

        Returns:
            None. The figure is displayed, not returned.
        """
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        product_cols = ['product_A', 'product_B', 'product_C']
        sales_by_date = self.sales_df.set_index('date')
        daily_sales = sales_by_date['sales_amount']

        # 1. One sub-panel per product.
        # Passing a single Axes together with subplots=True makes pandas clear
        # the whole figure to build its own grid, which would orphan the other
        # five panels. Instead split this grid cell into stacked Axes and hand
        # pandas exactly one Axes per column.
        grid_cell = axes[0, 0]
        inner_grid = grid_cell.get_subplotspec().subgridspec(len(product_cols), 1)
        grid_cell.remove()
        product_axes = [fig.add_subplot(inner_grid[row])
                        for row in range(len(product_cols))]
        sales_by_date[product_cols].plot(subplots=True, ax=product_axes)
        product_axes[0].set_title('产品销售分别显示')
        # Only the bottom panel keeps its date labels to avoid clutter.
        for upper_ax in product_axes[:-1]:
            upper_ax.set_xlabel('')
            upper_ax.tick_params(axis='x', labelbottom=False)

        # 2. Twin-axis chart
        ax2 = axes[0, 1]

        # Primary axis: monthly total.
        # NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of
        # 'ME'; 'M' is kept here so older pandas versions keep working.
        sales_monthly = daily_sales.resample('M').sum()
        line1 = ax2.plot(sales_monthly.index, sales_monthly.values,
                         'b-', linewidth=2, label='销售额')
        ax2.set_ylabel('销售额', color='blue')
        ax2.tick_params(axis='y', labelcolor='blue')

        # Secondary axis: monthly mean.
        ax2_twin = ax2.twinx()
        avg_monthly = daily_sales.resample('M').mean()
        line2 = ax2_twin.plot(avg_monthly.index, avg_monthly.values,
                              'r-', linewidth=2, label='平均值')
        ax2_twin.set_ylabel('平均销售额', color='red')
        ax2_twin.tick_params(axis='y', labelcolor='red')

        ax2.set_title('销售额双轴图')

        # Merge the handles of both axes into a single legend.
        lines = line1 + line2
        labels = [line.get_label() for line in lines]
        ax2.legend(lines, labels, loc='upper left')
        ax2.grid(True, alpha=0.3)

        # 3. Grouped scatter: one colour per department
        ax3 = axes[0, 2]

        for dept in self.employee_df['department'].unique():
            dept_data = self.employee_df[self.employee_df['department'] == dept]
            ax3.scatter(dept_data['experience'], dept_data['salary'],
                        label=dept, alpha=0.7, s=30)

        ax3.set_title('各部门经验-薪资关系')
        ax3.set_xlabel('工作经验')
        ax3.set_ylabel('薪资')
        ax3.legend()
        ax3.grid(True, alpha=0.3)

        # 4. Heatmap built from a pivot table (month x region mean)
        ax4 = axes[1, 0]

        pivot_table = self.sales_df.pivot_table(
            values='sales_amount',
            index=self.sales_df['date'].dt.month,
            columns='region',
            aggfunc='mean'
        )

        im = ax4.imshow(pivot_table.values, cmap='YlOrRd', aspect='auto')

        ax4.set_xticks(range(len(pivot_table.columns)))
        ax4.set_yticks(range(len(pivot_table.index)))
        ax4.set_xticklabels(pivot_table.columns)
        ax4.set_yticklabels([f'{m}月' for m in pivot_table.index])

        ax4.set_title('月度-地区销售热力图')
        plt.colorbar(im, ax=ax4)

        # 5. Resampling at different frequencies
        ax5 = axes[1, 1]

        weekly_avg = daily_sales.resample('W').mean()
        monthly_avg = daily_sales.resample('M').mean()

        ax5.plot(daily_sales.index, daily_sales.values,
                 alpha=0.3, color='lightgray', label='日数据')
        ax5.plot(weekly_avg.index, weekly_avg.values,
                 'b-', linewidth=2, label='周平均')
        ax5.plot(monthly_avg.index, monthly_avg.values,
                 'r-', linewidth=3, label='月平均')

        ax5.set_title('不同频率重采样对比')
        ax5.set_ylabel('销售额')
        ax5.legend()
        ax5.grid(True, alpha=0.3)

        # 6. Rolling statistics
        ax6 = axes[1, 2]

        # 30-day rolling mean and standard deviation; the first 29 values are
        # NaN, so the line and band start on day 30.
        rolling_mean = daily_sales.rolling(window=30).mean()
        rolling_std = daily_sales.rolling(window=30).std()

        ax6.plot(daily_sales.index, daily_sales.values,
                 alpha=0.3, color='lightblue', label='原始数据')
        ax6.plot(rolling_mean.index, rolling_mean.values,
                 'b-', linewidth=2, label='30日滚动均值')

        # Band of two rolling standard deviations around the rolling mean.
        ax6.fill_between(rolling_mean.index,
                         rolling_mean - 2*rolling_std,
                         rolling_mean + 2*rolling_std,
                         alpha=0.2, color='blue', label='±2σ置信带')

        ax6.set_title('滚动统计分析')
        ax6.set_ylabel('销售额')
        ax6.legend()
        ax6.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()
    
    def pandas_styling_demo(self):
        """Show four styled charts built from pandas groupings.

        Draws a 2x2 figure: salary scatter coloured by performance band,
        grouped bars of mean salary per department and age decade, a
        season-coloured sales scatter, and a lower-triangle correlation
        heatmap of the numeric employee columns. Displays it with
        ``plt.show()``.

        Returns:
            None. The figure is displayed, not returned.
        """
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # 1. Custom colour mapping by performance band
        ax1 = axes[0, 0]

        # The outer bins are open-ended so scores below 0 or above 100 (both
        # possible with normally distributed data) still land in a band
        # instead of becoming NaN and vanishing from the plot.
        performance_groups = pd.cut(self.employee_df['performance'],
                                    bins=[-np.inf, 70, 80, 90, np.inf],
                                    labels=['差', '一般', '良好', '优秀'])

        colors = {'差': 'red', '一般': 'orange', '良好': 'lightblue', '优秀': 'green'}

        for group in performance_groups.cat.categories:
            in_group = performance_groups == group
            group_data = self.employee_df[in_group]
            ax1.scatter(group_data['experience'], group_data['salary'],
                        c=colors[group], label=group, alpha=0.7, s=50)

        ax1.set_title('绩效分组薪资分布')
        ax1.set_xlabel('工作经验')
        ax1.set_ylabel('薪资')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # 2. Multi-level index visualisation
        ax2 = axes[0, 1]

        # Group by department and age decade. The decade has to be computed
        # from the age column itself; floor-dividing the string 'age' raises
        # TypeError. The Series keeps the name 'age', which becomes the name
        # of the second index level.
        age_decade = (self.employee_df['age'] // 10 * 10).astype(int)
        multi_index_data = self.employee_df.groupby(['department', age_decade]).agg({
            'salary': 'mean',
            'performance': 'mean'
        }).round(2)

        # Departments as rows, age decades as columns; a missing combination
        # is drawn as a zero-height bar.
        salary_table = multi_index_data['salary'].unstack('age').fillna(0)
        departments = salary_table.index
        x = np.arange(len(departments))

        # Share 80% of each department slot between the age groups so bars
        # never spill into the neighbouring department.
        n_groups = len(salary_table.columns)
        width = 0.8 / n_groups

        for i, decade in enumerate(salary_table.columns):
            ax2.bar(x + i*width, salary_table[decade].values, width,
                    label=f'{decade}岁组', alpha=0.8)

        ax2.set_title('部门-年龄组薪资对比')
        ax2.set_xlabel('部门')
        ax2.set_ylabel('平均薪资')
        # Centre each tick under its cluster of bars.
        ax2.set_xticks(x + width * (n_groups - 1) / 2)
        ax2.set_xticklabels(departments, rotation=45)
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        # 3. Time-series styling: colour daily sales by season
        ax3 = axes[1, 0]

        seasonal_data = self.sales_df.copy()
        seasonal_data['month'] = seasonal_data['date'].dt.month
        seasonal_data['season'] = seasonal_data['month'].map({
            12: '冬', 1: '冬', 2: '冬',
            3: '春', 4: '春', 5: '春',
            6: '夏', 7: '夏', 8: '夏',
            9: '秋', 10: '秋', 11: '秋'
        })

        season_colors = {'春': 'green', '夏': 'red', '秋': 'orange', '冬': 'blue'}

        for season in ['春', '夏', '秋', '冬']:
            season_data = seasonal_data[seasonal_data['season'] == season]
            ax3.scatter(season_data['date'], season_data['sales_amount'],
                        c=season_colors[season], label=season, alpha=0.6, s=20)

        ax3.set_title('季节性销售分布')
        ax3.set_xlabel('日期')
        ax3.set_ylabel('销售额')
        ax3.legend()
        ax3.grid(True, alpha=0.3)

        # 4. Correlation matrix
        ax4 = axes[1, 1]

        numeric_cols = ['salary', 'experience', 'performance', 'age']
        corr_matrix = self.employee_df[numeric_cols].corr()

        # True on and above the diagonal: those cells are hidden so only the
        # lower triangle is shown.
        upper_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

        im = ax4.imshow(corr_matrix.where(~upper_mask), cmap='RdBu_r',
                        vmin=-1, vmax=1, aspect='auto')

        # Print each visible coefficient; white text on strongly coloured cells.
        for i in range(len(corr_matrix)):
            for j in range(len(corr_matrix)):
                if not upper_mask[i, j]:
                    coefficient = corr_matrix.iloc[i, j]
                    ax4.text(j, i, f'{coefficient:.2f}',
                             ha="center", va="center",
                             color="white" if abs(coefficient) > 0.5 else "black",
                             fontweight='bold')

        ax4.set_xticks(range(len(corr_matrix)))
        ax4.set_yticks(range(len(corr_matrix)))
        ax4.set_xticklabels(corr_matrix.columns)
        ax4.set_yticklabels(corr_matrix.columns)
        ax4.set_title('员工数据相关性矩阵')

        plt.colorbar(im, ax=ax4, shrink=0.8)

        plt.tight_layout()
        plt.show()

# Usage example: build the demo object, then run each pandas plotting
# demonstration in turn.
pandas_demo = PandasIntegrationDemo()
pandas_demo.basic_pandas_plotting_demo()
pandas_demo.advanced_pandas_plotting_demo()
pandas_demo.pandas_styling_demo()

6.3 数据可视化最佳实践

6.3.1 图表选择指南

class VisualizationBestPractices:
    """数据可视化最佳实践演示类"""
    
    def __init__(self):
        """Configure matplotlib fonts and generate the practice datasets."""
        plt.rcParams.update({
            # Sans-serif candidates tried in order so the Chinese labels render.
            'font.sans-serif': ['SimHei', 'Arial Unicode MS'],
            # Use an ASCII hyphen for negative tick labels instead of U+2212.
            'axes.unicode_minus': False,
        })

        # Populate the category/time/distribution/relationship datasets.
        self.generate_practice_data()
    
    def generate_practice_data(self):
        """生成实践数据"""
        np.random.seed(42)
        
        # 1. 分类数据
        self.category_data = pd.DataFrame({
            'category': ['A', 'B', 'C', 'D', 'E'],
            'values': [23, 45, 56, 78, 32],
            'subcategory': ['A1', 'B1', 'C1', 'D1', 'E1']
        })
        
        # 2. 时间序列数据
        dates = pd.date_range('2023-01-01', periods=100, freq='D')
        self.time_data = pd.DataFrame({
            'date': dates,
            'metric1': np.cumsum(np.random.randn(100)) + 100,
            'metric2': np.cumsum(np.random.randn(100)) + 50,
            'metric3': np.random.normal(75, 10, 100)
        })
        
        # 3. 分布数据
        self.distribution_data = {
            'normal': np.random.normal(50, 15, 1000),
            'skewed': np.random.exponential(2, 1000),
            'bimodal': np.concatenate([np.random.normal(30, 5, 500), 
                                     np.random.normal(70, 5, 500)])
        }
        
        # 4. 关系数据
        n = 200
        self.relationship_data = pd.DataFrame({
            'x': np.random.randn(n),
            'y': np.random.randn(n),
            'size': np.random.uniform(20, 200, n),
            'category': np.random.choice(['Type1', 'Type2', 'Type3'], n)
        })
        
        # 添加相关性
        self.relationship_data['y'] = (0.7 * self.relationship_data['x'] + 
                                      0.3 * self.relationship_data['y'])
    
    def chart_selection_guide_demo(self):
        """Draw a 3x3 gallery pairing common analysis goals with chart types.

        Panels: bar (category comparison), pie (share of whole), line (trend),
        histogram (distribution shape), box plot (comparing distributions),
        scatter (relationship), a simplified tree (hierarchy), a bubble chart
        (simulated geography) and a radar chart (multi-dimensional data).
        Reads the datasets created by ``generate_practice_data`` and shows
        the figure with ``plt.show()``.

        Returns:
            None. The figure is displayed, not returned.
        """
        fig, axes = plt.subplots(3, 3, figsize=(18, 15))

        # 1. Comparing categories: bar chart
        ax1 = axes[0, 0]
        bars = ax1.bar(self.category_data['category'], self.category_data['values'],
                       color='steelblue', alpha=0.8)
        ax1.set_title('类别比较 - 柱状图\n适用:比较不同类别的数值')
        ax1.set_ylabel('数值')

        # Value label just above each bar.
        for bar, value in zip(bars, self.category_data['values']):
            ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                     str(value), ha='center', va='bottom')

        ax1.grid(True, alpha=0.3, axis='y')

        # 2. Share of a whole: pie chart
        ax2 = axes[0, 1]
        colors = plt.cm.Set3(np.linspace(0, 1, len(self.category_data)))
        ax2.pie(self.category_data['values'],
                labels=self.category_data['category'],
                autopct='%1.1f%%', colors=colors,
                startangle=90)
        ax2.set_title('占比显示 - 饼图\n适用:显示部分与整体的关系')

        # 3. Change over time: line chart
        ax3 = axes[0, 2]
        ax3.plot(self.time_data['date'], self.time_data['metric1'],
                 'b-', linewidth=2, label='指标1')
        ax3.plot(self.time_data['date'], self.time_data['metric2'],
                 'r-', linewidth=2, label='指标2')
        ax3.set_title('趋势变化 - 线图\n适用:显示数据随时间的变化')
        ax3.set_ylabel('数值')
        ax3.legend()
        ax3.grid(True, alpha=0.3)

        # Rotate the date labels so they do not overlap.
        ax3.tick_params(axis='x', rotation=45)

        # 4. Shape of a distribution: histogram
        ax4 = axes[1, 0]
        ax4.hist(self.distribution_data['normal'], bins=30, alpha=0.7,
                 color='skyblue', density=True)
        ax4.set_title('分布形状 - 直方图\n适用:显示数据的分布特征')
        ax4.set_xlabel('数值')
        ax4.set_ylabel('密度')
        ax4.grid(True, alpha=0.3)

        # 5. Comparing several distributions: box plot
        ax5 = axes[1, 1]
        box_data = [self.distribution_data['normal'],
                    self.distribution_data['skewed'],
                    self.distribution_data['bimodal']]
        box_plot = ax5.boxplot(box_data, patch_artist=True)
        # Label the boxes through the axis rather than boxplot's ``labels``
        # argument, which Matplotlib 3.9 deprecates in favour of
        # ``tick_labels``; this form works on both old and new versions.
        ax5.set_xticklabels(['正态', '偏态', '双峰'])

        colors_box = ['lightblue', 'lightgreen', 'lightcoral']
        for patch, color in zip(box_plot['boxes'], colors_box):
            patch.set_facecolor(color)

        ax5.set_title('多组分布比较 - 箱线图\n适用:比较多组数据的分布')
        ax5.set_ylabel('数值')
        ax5.grid(True, alpha=0.3)

        # 6. Relationship between two variables: scatter plot
        ax6 = axes[1, 2]
        colors_scatter = {'Type1': 'red', 'Type2': 'blue', 'Type3': 'green'}

        for cat in self.relationship_data['category'].unique():
            cat_data = self.relationship_data[self.relationship_data['category'] == cat]
            ax6.scatter(cat_data['x'], cat_data['y'],
                        c=colors_scatter[cat], label=cat, alpha=0.7, s=30)

        ax6.set_title('相关关系 - 散点图\n适用:显示两个变量的关系')
        ax6.set_xlabel('变量X')
        ax6.set_ylabel('变量Y')
        ax6.legend()
        ax6.grid(True, alpha=0.3)

        # 7. Hierarchy: simplified tree diagram
        ax7 = axes[2, 0]

        hierarchy_data = {
            '总部': {'销售部': 45, '技术部': 38, '市场部': 25},
            '分部A': {'销售A': 20, '技术A': 15},
            '分部B': {'销售B': 18, '技术B': 12}
        }

        # One row per parent; children are laid out left to right on that row.
        y_pos = 0.8
        colors_tree = ['lightblue', 'lightgreen', 'lightcoral']

        for (parent, children), parent_color in zip(hierarchy_data.items(), colors_tree):
            ax7.text(0.1, y_pos, parent, fontsize=12, fontweight='bold',
                     bbox=dict(boxstyle='round', facecolor=parent_color, alpha=0.7))

            x_child = 0.4
            for child, value in children.items():
                ax7.text(x_child, y_pos, f'{child}\n({value})', fontsize=10,
                         bbox=dict(boxstyle='round', facecolor='white', alpha=0.7))
                # Connector from the parent box to this child.
                ax7.plot([0.25, x_child-0.05], [y_pos, y_pos], 'k-', alpha=0.5)
                x_child += 0.15

            y_pos -= 0.3

        ax7.set_xlim(0, 1)
        ax7.set_ylim(0, 1)
        ax7.set_title('层次结构 - 树状图\n适用:显示层次关系')
        ax7.axis('off')

        # 8. Geographic distribution: bubble chart (simulated)
        ax8 = axes[2, 1]

        cities = ['北京', '上海', '广州', '深圳', '杭州', '成都']
        x_coords = [116.4, 121.5, 113.3, 114.1, 120.2, 104.1]  # longitude
        y_coords = [39.9, 31.2, 23.1, 22.5, 30.3, 30.7]       # latitude
        populations = [2154, 2424, 1491, 1344, 981, 1658]      # population, in units of 10,000

        # Min-max normalise the coordinates into the 0-1 range.
        x_min, x_max = min(x_coords), max(x_coords)
        y_min, y_max = min(y_coords), max(y_coords)
        x_norm = [(x - x_min) / (x_max - x_min) for x in x_coords]
        y_norm = [(y - y_min) / (y_max - y_min) for y in y_coords]

        scatter = ax8.scatter(x_norm, y_norm, s=[p/5 for p in populations],
                              c=populations, cmap='viridis', alpha=0.7)

        # City name next to each bubble.
        for city, city_x, city_y in zip(cities, x_norm, y_norm):
            ax8.annotate(city, (city_x, city_y),
                         xytext=(5, 5), textcoords='offset points', fontsize=9)

        ax8.set_title('地理分布 - 气泡图\n适用:显示地理位置和数值大小')
        ax8.set_xlabel('经度 (归一化)')
        ax8.set_ylabel('纬度 (归一化)')
        plt.colorbar(scatter, ax=ax8, label='人口(万)')

        # 9. Multi-dimensional data: radar chart
        categories = ['技能A', '技能B', '技能C', '技能D', '技能E']
        person1 = [4, 3, 5, 4, 3]
        person2 = [3, 5, 3, 4, 5]

        # One spoke per category; repeat the first point to close each polygon.
        angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False).tolist()
        angles += angles[:1]
        person1 += person1[:1]
        person2 += person2[:1]

        # The grid cell was created as a cartesian Axes. Remove that
        # placeholder before adding the polar Axes, otherwise both would be
        # drawn on top of each other in the same cell.
        axes[2, 2].remove()
        ax9 = fig.add_subplot(3, 3, 9, projection='polar')
        ax9.plot(angles, person1, 'o-', linewidth=2, label='人员1', color='blue')
        ax9.fill(angles, person1, alpha=0.25, color='blue')
        ax9.plot(angles, person2, 'o-', linewidth=2, label='人员2', color='red')
        ax9.fill(angles, person2, alpha=0.25, color='red')

        ax9.set_xticks(angles[:-1])
        ax9.set_xticklabels(categories)
        ax9.set_ylim(0, 5)
        ax9.set_title('多维数据 - 雷达图\n适用:比较多个维度的表现', y=1.08)
        ax9.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))

        plt.tight_layout()
        plt.show()

# Usage example: build the demo object and draw the chart-selection gallery.
best_practices = VisualizationBestPractices()
best_practices.chart_selection_guide_demo()

6.3.2 颜色设计原则

class ColorDesignPrinciples:
    """颜色设计原则演示类"""
    
    def __init__(self):
        """Configure matplotlib fonts and create the shared demo series."""
        plt.rcParams.update({
            # Sans-serif candidates tried in order so the Chinese labels render.
            'font.sans-serif': ['SimHei', 'Arial Unicode MS'],
            # Use an ASCII hyphen for negative tick labels instead of U+2212.
            'axes.unicode_minus': False,
        })

        # Five reproducible random walks of 100 steps, one column per category.
        np.random.seed(42)
        steps = np.random.randn(100, 5)
        self.data = np.cumsum(steps, axis=0)
        self.categories = ['类别A', '类别B', '类别C', '类别D', '类别E']
    
    def color_accessibility_demo(self):
        """Compare colour schemes from an accessibility point of view.

        Draws a 2x3 figure: a colorblind-friendly palette, a discouraged
        saturated palette, a single-hue gradient, a qualitative colormap for
        categories, a sequential colormap for continuous data and a diverging
        colormap for zero-centred data. The two heatmaps draw fresh random
        values from NumPy's global RNG. Displays the figure with
        ``plt.show()``.

        Returns:
            None. The figure is displayed, not returned.
        """
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        series_by_category = list(zip(self.categories, self.data.T))

        # 1. Colorblind-friendly scheme
        ax1 = axes[0, 0]

        # Okabe-Ito palette (blue, orange, bluish green, vermillion, reddish
        # purple). Matplotlib's default tab10 colours were used here before,
        # but their green/red pair is hard to separate with red-green colour
        # vision deficiency, which contradicted the panel title.
        colorblind_friendly = ['#0072B2', '#E69F00', '#009E73', '#D55E00', '#CC79A7']

        for (category, series), color in zip(series_by_category, colorblind_friendly):
            ax1.plot(series, color=color, linewidth=3, label=category)

        ax1.set_title('色盲友好颜色方案\n使用高对比度和不同亮度')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # 2. Discouraged colour combination
        ax2 = axes[0, 1]

        # Fully saturated primaries, including a pure red / pure green pair.
        bad_colors = ['#ff0000', '#00ff00', '#ffff00', '#ff00ff', '#00ffff']

        for (category, series), color in zip(series_by_category, bad_colors):
            ax2.plot(series, color=color, linewidth=3, label=category)

        ax2.set_title('不推荐的颜色组合\n红绿色盲难以区分')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        # 3. Single-hue gradient
        ax3 = axes[0, 2]

        # Five shades of one hue, skipping the palest end of the colormap.
        blues = plt.cm.Blues(np.linspace(0.3, 1.0, 5))

        for (category, series), color in zip(series_by_category, blues):
            ax3.plot(series, color=color, linewidth=3, label=category)

        ax3.set_title('单色渐变方案\n使用蓝色的不同深浅')
        ax3.legend()
        ax3.grid(True, alpha=0.3)

        # 4. Colours for categorical data
        ax4 = axes[1, 0]

        # Qualitative colormap: distinct hues with no implied ordering.
        qualitative_colors = plt.cm.Set1(np.linspace(0, 1, 5))

        values = [23, 45, 56, 78, 32]
        bars = ax4.bar(self.categories, values, color=qualitative_colors, alpha=0.8)

        # Value label just above each bar.
        for bar, value in zip(bars, values):
            ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                     str(value), ha='center', va='bottom', fontweight='bold')

        ax4.set_title('分类数据颜色选择\n使用定性颜色映射')
        ax4.set_ylabel('数值')
        ax4.grid(True, alpha=0.3, axis='y')

        # 5. Colours for continuous data
        ax5 = axes[1, 1]

        heatmap_data = np.random.randn(10, 10).cumsum(axis=1)

        # Sequential colormap.
        im = ax5.imshow(heatmap_data, cmap='viridis', aspect='auto')
        ax5.set_title('连续数据颜色选择\n使用Viridis颜色映射')
        plt.colorbar(im, ax=ax5, shrink=0.8)

        # 6. Colours for diverging data
        ax6 = axes[1, 2]

        # Values centred on zero.
        diverging_data = np.random.randn(10, 10) * 2

        # Symmetric limits keep zero at the neutral midpoint of the colormap.
        im2 = ax6.imshow(diverging_data, cmap='RdBu_r', vmin=-4, vmax=4, aspect='auto')
        ax6.set_title('发散数据颜色选择\n使用红蓝发散颜色映射')
        plt.colorbar(im2, ax=ax6, shrink=0.8)

        plt.tight_layout()
        plt.show()
    
    def color_psychology_demo(self):
        """Illustrate how colour choice supports the message of a chart.

        Draws a 2x2 figure: warm-toned growth bars, a cool-toned cost trend,
        risk levels on a green-to-red scale, and a market-share pie with the
        first slice emphasised. Displays the figure with ``plt.show()``.
        """
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        growth_ax, cost_ax = axes[0, 0], axes[0, 1]
        risk_ax, brand_ax = axes[1, 0], axes[1, 1]

        # 1. Warm tones: sales growth
        warm_data = np.array([10, 15, 22, 28, 35, 42])
        warm_colors = ['#ff6b6b', '#ff8e53', '#ff6b9d', '#c44569', '#f8b500', '#feca57']
        month_positions = range(len(warm_data))

        growth_ax.bar(month_positions, warm_data,
                      color=warm_colors[:len(warm_data)], alpha=0.8)
        growth_ax.set_title('温暖色调 - 积极增长趋势\n传达活力和成功')
        growth_ax.set_ylabel('销售额增长(%)')
        growth_ax.set_xlabel('月份')
        growth_ax.grid(True, alpha=0.3, axis='y')

        # 2. Cool tones: falling operating cost
        cool_data = np.array([100, 95, 88, 82, 75, 70])
        cool_colors = ['#74b9ff', '#0984e3', '#00b894', '#00cec9', '#6c5ce7', '#a29bfe']
        # Only the first cool colour is used, for both the line and its fill.
        trend_color = cool_colors[0]
        cost_positions = range(len(cool_data))

        cost_ax.plot(cost_positions, cool_data, color=trend_color,
                     linewidth=3, marker='o', markersize=8)
        cost_ax.fill_between(cost_positions, cool_data, alpha=0.3, color=trend_color)
        cost_ax.set_title('冷色调 - 成本控制趋势\n传达稳定和可靠')
        cost_ax.set_ylabel('运营成本')
        cost_ax.set_xlabel('月份')
        cost_ax.grid(True, alpha=0.3)

        # 3. Warning and danger: red scale
        risk_levels = ['低', '中', '高', '极高']
        risk_values = [15, 35, 65, 90]
        risk_colors = ['#2ecc71', '#f39c12', '#e74c3c', '#c0392b']

        risk_bars = risk_ax.bar(risk_levels, risk_values, color=risk_colors, alpha=0.8)

        for risk_bar, value in zip(risk_bars, risk_values):
            centre_x = risk_bar.get_x() + risk_bar.get_width()/2
            top = risk_bar.get_height()
            # Warning sign above bars whose value exceeds 50.
            if value > 50:
                risk_ax.text(centre_x, top + 2,
                             '⚠️', ha='center', va='bottom', fontsize=16)
            # Percentage printed in the middle of every bar.
            risk_ax.text(centre_x, top/2,
                         f'{value}%', ha='center', va='center',
                         fontweight='bold', color='white')

        risk_ax.set_title('风险等级可视化\n使用红色系表达危险程度')
        risk_ax.set_ylabel('风险指数')
        risk_ax.grid(True, alpha=0.3, axis='y')

        # 4. Brand colours: simulated market share
        brands = ['品牌A', '品牌B', '品牌C', '品牌D', '品牌E']
        market_share = [35, 25, 20, 12, 8]
        brand_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

        wedges, _texts, _autotexts = brand_ax.pie(
            market_share, labels=brands,
            colors=brand_colors, autopct='%1.1f%%',
            startangle=90, explode=(0.05, 0, 0, 0, 0))

        # Outline the first (largest) slice, which is also the exploded one.
        leading_wedge = wedges[0]
        leading_wedge.set_edgecolor('black')
        leading_wedge.set_linewidth(2)

        brand_ax.set_title('品牌市场份额\n使用品牌特色颜色')

        plt.tight_layout()
        plt.show()

# Usage example: build the demo object and run both colour demonstrations.
color_design = ColorDesignPrinciples()
color_design.color_accessibility_demo()
color_design.color_psychology_demo()

6.3.3 交互式可视化基础

class InteractiveVisualization:
    """Static matplotlib mock-ups of common interactive-chart features.

    Each ``*_demo`` method draws a single figure that imitates what an
    interactive feature (tooltip, zoom, animation frames, widgets) would
    look like and then calls ``plt.show()``.  No real event handlers are
    registered by the demos.

    Attributes:
        dates: Daily ``DatetimeIndex`` of 365 days starting 2023-01-01.
        ts_data: DataFrame with ``date``, ``sales``, ``profit`` and
            ``customers`` columns (cumulative sums of normal noise).
        scatter_data: DataFrame of 500 random points with ``x``, ``y``,
            ``size``, ``category`` and ``value`` columns.
    """

    # Colour assigned to each scatter category.  Kept on the class so that
    # every demo method can reach it; previously it was a local variable of
    # ``mouse_interaction_demo`` only, and the other demos failed with
    # ``NameError`` when they looked up ``colors``.
    CATEGORY_COLORS = {'A': 'red', 'B': 'blue', 'C': 'green', 'D': 'orange'}

    def __init__(self):
        """Set the font-related rcParams and generate the demo datasets."""
        plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS']
        plt.rcParams['axes.unicode_minus'] = False

        self.generate_interactive_data()

    def generate_interactive_data(self):
        """Build the time-series and scatter datasets used by the demos.

        The NumPy global random state is re-seeded with 42, so the generated
        data is identical on every call.
        """
        np.random.seed(42)

        # Time-series data: three random walks offset to different levels.
        self.dates = pd.date_range('2023-01-01', periods=365, freq='D')
        self.ts_data = pd.DataFrame({
            'date': self.dates,
            'sales': np.cumsum(np.random.randn(365)) + 1000,
            'profit': np.cumsum(np.random.randn(365)) + 500,
            'customers': np.cumsum(np.random.randn(365)) + 200
        })

        # Scatter data: random positions, marker sizes, categories, values.
        n = 500
        self.scatter_data = pd.DataFrame({
            'x': np.random.randn(n),
            'y': np.random.randn(n),
            'size': np.random.uniform(20, 200, n),
            'category': np.random.choice(['A', 'B', 'C', 'D'], n),
            'value': np.random.uniform(0, 100, n)
        })

    @staticmethod
    def _split_by_category(frame):
        """Yield ``(category, rows)`` pairs in order of first appearance.

        Args:
            frame: DataFrame that has a ``category`` column.

        Yields:
            Tuples of the category label and the sub-frame of matching rows.
        """
        for category in frame['category'].unique():
            yield category, frame[frame['category'] == category]

    def mouse_interaction_demo(self):
        """Draw a mock "click for details" scatter plot and a zoomed view.

        Left axes: all points coloured by category, one point ringed and
        annotated like a tooltip, plus a dashed rectangle marking the zoom
        region.  Right axes: only the points inside that region.
        """
        from matplotlib.patches import Rectangle

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        # 1. Click-to-highlight (simulated).
        ax1.set_title('点击数据点查看详情\n(模拟交互效果)')

        for category, cat_data in self._split_by_category(self.scatter_data):
            ax1.scatter(cat_data['x'], cat_data['y'],
                        c=self.CATEGORY_COLORS[category], label=category,
                        s=cat_data['size'] / 5, alpha=0.7)

        # Pretend the row at position 50 was clicked: ring it in black.
        selected_point = self.scatter_data.iloc[50]
        ax1.scatter(selected_point['x'], selected_point['y'],
                    s=200, facecolors='none', edgecolors='black', linewidth=3)

        # Tooltip-style info box attached to the selected point.
        ax1.annotate(f'类别: {selected_point["category"]}\n'
                     f'数值: {selected_point["value"]:.1f}\n'
                     f'大小: {selected_point["size"]:.1f}',
                     xy=(selected_point['x'], selected_point['y']),
                     xytext=(20, 20), textcoords='offset points',
                     bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.8),
                     arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

        ax1.set_xlabel('X 坐标')
        ax1.set_ylabel('Y 坐标')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Dashed frame on the overview showing which region is "zoomed".
        zoom_rect = Rectangle((-1, -1), 2, 2, linewidth=2,
                              edgecolor='red', facecolor='none', linestyle='--')
        ax1.add_patch(zoom_rect)

        # 2. Zoom / pan (simulated) by plotting only the points in the frame.
        ax2.set_title('缩放区域详细视图\n(模拟交互效果)')

        zoom_data = self.scatter_data[
            (self.scatter_data['x'] > -1) & (self.scatter_data['x'] < 1) &
            (self.scatter_data['y'] > -1) & (self.scatter_data['y'] < 1)
        ]

        for category, cat_data in self._split_by_category(zoom_data):
            ax2.scatter(cat_data['x'], cat_data['y'],
                        c=self.CATEGORY_COLORS[category], label=category,
                        s=cat_data['size'] / 3, alpha=0.8)

        ax2.set_xlim(-1, 1)
        ax2.set_ylim(-1, 1)
        ax2.set_xlabel('X 坐标 (缩放)')
        ax2.set_ylabel('Y 坐标 (缩放)')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def animation_basics_demo(self):
        """Show animation "key frames" as overlaid static plots.

        Left axes: the sales series truncated at several day counts.
        Right axes: the scatter points drawn four times with growing marker
        sizes and opacity, imitating a bubble-size animation.

        For a real animation see ``create_sales_animation``.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        # 1. Key frames of a progressively revealed time series.
        ax1.set_title('动画关键帧展示\n(时间序列数据逐步显示)')

        time_points = [50, 100, 200, 365]
        colors_time = ['lightblue', 'blue', 'darkblue', 'navy']

        for time_point, color in zip(time_points, colors_time):
            data_subset = self.ts_data.iloc[:time_point]
            ax1.plot(data_subset['date'], data_subset['sales'],
                     color=color, linewidth=2, alpha=0.7,
                     label=f'第{time_point}天')

        ax1.set_xlabel('日期')
        ax1.set_ylabel('销售额')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # 2. Bubble chart drawn at several "time steps" (size multipliers).
        ax2.set_title('动态气泡图\n(模拟大小变化动画)')

        time_steps = [0.5, 1.0, 1.5, 2.0]
        alphas = [0.3, 0.5, 0.7, 1.0]
        last_step = len(time_steps) - 1

        for step_index, (multiplier, alpha) in enumerate(zip(time_steps, alphas)):
            for category, cat_data in self._split_by_category(self.scatter_data):
                cat_sizes = cat_data['size'].values * multiplier

                # Only the final step gets a legend label; an empty label
                # keeps the earlier layers out of the legend.
                label = f'{category} (t={multiplier})' if step_index == last_step else ""
                ax2.scatter(cat_data['x'], cat_data['y'],
                            s=cat_sizes / 10, alpha=alpha * 0.3,
                            c=self.CATEGORY_COLORS[category],
                            label=label)

        ax2.set_xlabel('X 坐标')
        ax2.set_ylabel('Y 坐标')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def create_sales_animation(self, frames=36, step=10, interval=200):
        """Build a real ``FuncAnimation`` that reveals the sales series.

        This is opt-in: none of the demos call it, so static environments
        are unaffected.  The caller must keep a reference to the returned
        object for as long as the animation should run, and may save it,
        e.g. ``anim.save('sales_animation.gif', writer='pillow', fps=5)``.

        Args:
            frames: Number of animation frames.
            step: Number of additional days revealed per frame.
            interval: Delay between frames in milliseconds.

        Returns:
            The ``matplotlib.animation.FuncAnimation`` instance.
        """
        from matplotlib.animation import FuncAnimation

        fig, ax = plt.subplots(figsize=(10, 6))

        def animate_line(frame):
            # Redraw from scratch with the first ``days`` rows of the series.
            days = (frame + 1) * step
            ax.clear()
            data_subset = self.ts_data.iloc[:days]
            ax.plot(data_subset['date'], data_subset['sales'])
            ax.set_title(f'销售趋势动画 - 第{days}天')
            ax.grid(True, alpha=0.3)

        return FuncAnimation(fig, animate_line, frames=frames,
                             interval=interval, repeat=True)

    def widget_simulation_demo(self):
        """Draw a 2x2 figure imitating slider, dropdown, checkbox and range widgets.

        Each panel shows, side by side, the states a widget would switch
        between: several smoothing windows, several datasets, visible versus
        hidden series, and a highlighted date range.
        """
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # 1. Slider (simulated): the same noisy sine smoothed with different
        #    moving-average window sizes.
        ax1 = axes[0, 0]
        ax1.set_title('滑块控制效果模拟\n(不同平滑参数)')

        x = np.linspace(0, 10, 100)
        y_original = np.sin(x) + 0.3 * np.random.randn(100)

        smoothing_levels = [1, 5, 10, 20]
        colors_smooth = ['red', 'orange', 'blue', 'green']

        ax1.plot(x, y_original, 'k.', alpha=0.3, label='原始数据')

        for smooth_level, color in zip(smoothing_levels, colors_smooth):
            # Simple moving average; mode='same' keeps the output the same
            # length as ``x`` so both can be plotted together.
            kernel = np.ones(smooth_level) / smooth_level
            y_smooth = np.convolve(y_original, kernel, mode='same')
            ax1.plot(x, y_smooth, color=color, linewidth=2,
                     label=f'平滑参数={smooth_level}')

        ax1.set_xlabel('X')
        ax1.set_ylabel('Y')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # 2. Dropdown (simulated): first 100 values of each available series.
        ax2 = axes[0, 1]
        ax2.set_title('数据集选择效果模拟\n(不同类别数据)')

        datasets = {
            '销售数据': self.ts_data['sales'].values[:100],
            '利润数据': self.ts_data['profit'].values[:100],
            '客户数据': self.ts_data['customers'].values[:100]
        }
        colors_data = ['blue', 'green', 'red']

        for (name, data), color in zip(datasets.items(), colors_data):
            ax2.plot(data, color=color, linewidth=2, alpha=0.7, label=name)

        ax2.set_xlabel('时间')
        ax2.set_ylabel('数值')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        # 3. Checkboxes (simulated): series B is drawn greyed-out as "hidden".
        ax3 = axes[1, 0]
        ax3.set_title('系列显示控制模拟\n(复选框效果)')

        visible_series = ['A', 'C', 'D']

        for category, cat_data in self._split_by_category(self.scatter_data):
            if category in visible_series:
                ax3.scatter(cat_data['x'], cat_data['y'],
                            c=self.CATEGORY_COLORS[category], label=f'{category} ✓',
                            s=50, alpha=0.8)
            else:
                ax3.scatter(cat_data['x'], cat_data['y'],
                            c='lightgray', label=f'{category} ✗',
                            s=20, alpha=0.3)

        ax3.set_xlabel('X 坐标')
        ax3.set_ylabel('Y 坐标')
        ax3.legend()
        ax3.grid(True, alpha=0.3)

        # 4. Range selector (simulated): rows 100-199 highlighted on top of
        #    the full, faded series.
        ax4 = axes[1, 1]
        ax4.set_title('时间范围选择模拟\n(范围选择器效果)')

        ax4.plot(self.ts_data['date'], self.ts_data['sales'],
                 'lightblue', linewidth=1, alpha=0.5, label='完整数据')

        selected_range = self.ts_data.iloc[100:200]
        ax4.plot(selected_range['date'], selected_range['sales'],
                 'blue', linewidth=3, label='选中范围')

        # Shaded band spanning the first to the last selected date.
        ax4.axvspan(selected_range['date'].iloc[0], selected_range['date'].iloc[-1],
                    alpha=0.2, color='yellow', label='选择区域')

        ax4.set_xlabel('日期')
        ax4.set_ylabel('销售额')
        ax4.legend()
        ax4.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# Usage example: create the interactive-visualization demo object, then run
# the mouse-interaction, animation and widget-simulation demos in that order.
interactive_viz = InteractiveVisualization()
interactive_viz.mouse_interaction_demo()
interactive_viz.animation_basics_demo()
interactive_viz.widget_simulation_demo()

6.4 本章总结

6.4.1 学习要点回顾

本章我们深入学习了数据处理与可视化的核心技能:

数据导入与清洗
- 掌握了多种数据源的处理方法(CSV、Excel、JSON、数据库)
- 学会了数据清洗的基本技巧(缺失值处理、异常值检测、数据类型转换)
- 了解了数据预处理的重要性和常用方法

时间序列分析
- 学习了时间序列数据的可视化技巧
- 掌握了移动平均、季节性分解等分析方法
- 了解了自相关分析和频谱分析的应用

统计分析可视化
- 学会了描述性统计的可视化表示
- 掌握了相关性分析和回归分析的图形化展示
- 了解了假设检验和方差分析的可视化方法

商业智能分析
- 学习了销售分析、用户分析等商业场景的可视化
- 掌握了关键指标仪表板的设计原则
- 了解了预测分析的可视化技巧

Pandas集成应用
- 深入学习了Pandas与Matplotlib的集成使用
- 掌握了DataFrame的直接绘图方法
- 学会了复杂数据结构的可视化技巧

可视化最佳实践
- 学习了图表类型的选择原则
- 掌握了颜色设计的基本规律
- 了解了交互式可视化的基础概念

6.4.2 实践练习

练习1:综合数据分析项目

# 创建一个完整的数据分析项目
# 1. 数据导入和清洗
# 2. 探索性数据分析
# 3. 统计分析和建模
# 4. 结果可视化和报告生成

class DataAnalysisProject:
    """Exercise skeleton for an end-to-end data analysis project.

    The intended stages are loading/cleaning, exploratory analysis,
    statistical modelling and report generation.  Every stage is still a
    placeholder that does nothing and returns ``None``.
    """

    def __init__(self, data_source):
        """Run the load-and-clean stage on ``data_source`` and keep the result."""
        cleaned = self.load_and_clean_data(data_source)
        self.data = cleaned

    def load_and_clean_data(self, source):
        """Placeholder for the data loading and cleaning logic."""
        return None

    def exploratory_analysis(self):
        """Placeholder for the exploratory data analysis."""
        return None

    def statistical_modeling(self):
        """Placeholder for the statistical modelling step."""
        return None

    def generate_report(self):
        """Placeholder for generating the visual report."""
        return None

练习2:交互式仪表板设计

# 设计一个交互式仪表板
# 包含多个图表类型和交互功能

class InteractiveDashboard:
    """Exercise skeleton for a dashboard with several charts and controls.

    All hooks are placeholders that do nothing and return ``None``.
    """

    def __init__(self):
        """Invoke the layout hook first, then the widget hook."""
        for build_step in (self.setup_layout, self.create_widgets):
            build_step()

    def setup_layout(self):
        """Placeholder for arranging the dashboard layout."""
        return None

    def create_widgets(self):
        """Placeholder for creating the interactive controls."""
        return None

    def update_charts(self, *args):
        """Placeholder for refreshing the charts; accepts any positional args."""
        return None

练习3:自定义可视化组件

# 创建自定义的可视化组件
# 实现特定业务需求的图表类型

class CustomVisualization:
    """Exercise skeleton for a custom, business-specific chart component.

    Stores the supplied data; the chart and interactivity hooks are
    placeholders that do nothing and return ``None``.
    """

    def __init__(self, data):
        """Keep a reference to ``data`` on the instance (no copy is made)."""
        self.data = data

    def create_custom_chart(self):
        """Placeholder for drawing the custom chart."""
        return None

    def add_interactivity(self):
        """Placeholder for adding interactive behaviour."""
        return None

6.4.3 常见问题解答

Q1:如何处理大数据集的可视化?
A1:对于大数据集,可以采用以下策略:
- 数据采样:随机抽取代表性样本
- 数据聚合:按时间或类别聚合数据
- 分批处理:将数据分批进行可视化
- 使用专门的大数据可视化工具

Q2:如何选择合适的图表类型?
A2:图表选择应基于数据类型和分析目的:
- 比较数据:柱状图、条形图
- 显示趋势:线图、面积图
- 显示分布:直方图、箱线图
- 显示关系:散点图、热力图
- 显示占比:饼图、环形图

Q3:如何提高图表的可读性?
A3:提高可读性的方法:
- 使用清晰的标题和标签
- 选择合适的颜色和字体
- 添加网格线和参考线
- 保持图表简洁,避免过度装饰
- 确保图例清晰易懂

Q4:如何处理中文字体显示问题?
A4:中文字体设置方法:

# Sans-serif font families matplotlib should try, in list order; this is the
# Chinese-font setting referred to by the FAQ answer above.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS']
# Use the ASCII hyphen instead of the Unicode minus sign on axes, so negative
# tick labels still render if the chosen font lacks the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

6.4.4 下章预告

下一章我们将学习动画与交互式图表,内容包括:

  1. 动画制作基础

    • FuncAnimation的使用
    • 关键帧动画
    • 动画保存和导出
  2. 交互式图表进阶

    • 事件处理机制
    • 自定义交互功能
    • 响应式图表设计
  3. Web集成应用

    • Matplotlib与Web框架集成
    • 实时数据可视化
    • 在线图表分享
  4. 性能优化技巧

    • 大数据量处理
    • 内存优化
    • 渲染性能提升

通过下一章的学习,你将能够创建更加生动和互动的数据可视化作品!