本章概述

张量(Tensor)是PyTorch的核心数据结构,自动微分(Automatic Differentiation)则是深度学习训练的基础机制。本章将深入探讨PyTorch张量的各种操作,以及自动微分系统的工作原理。学完本章,您将掌握张量的高级操作技巧,并理解梯度计算的底层机制。

学习目标

  • 深入理解PyTorch张量的数据结构和内存模型
  • 掌握张量的创建、变换、索引和广播机制
  • 理解自动微分系统的工作原理
  • 学会梯度计算和反向传播的实现
  • 掌握计算图的构建和优化技巧

2.1 张量深入理解

2.1.1 张量的内存模型

import torch
import numpy as np
import matplotlib.pyplot as plt

def tensor_memory_model():
    """张量内存模型详解"""
    print("=== 张量内存模型 ===")
    
    # 1. 张量的存储结构
    print("1. 张量存储结构:")
    x = torch.tensor([[1, 2, 3], [4, 5, 6]])
    print(f"张量: \n{x}")
    print(f"形状: {x.shape}")
    print(f"步长: {x.stride()}")
    print(f"存储偏移: {x.storage_offset()}")
    print(f"是否连续: {x.is_contiguous()}")
    
    # 2. 视图 vs 拷贝
    print("\n2. 视图 vs 拷贝:")
    
    # 视图操作(共享内存)
    y = x.view(3, 2)
    print(f"原张量: \n{x}")
    print(f"视图张量: \n{y}")
    print(f"共享存储: {x.storage().data_ptr() == y.storage().data_ptr()}")
    
    # 修改原张量,视图也会改变
    x[0, 0] = 100
    print(f"修改后原张量: \n{x}")
    print(f"修改后视图: \n{y}")
    
    # 拷贝操作(独立内存)
    z = x.clone()
    x[0, 0] = 1
    print(f"拷贝张量: \n{z}")
    print(f"独立存储: {x.storage().data_ptr() != z.storage().data_ptr()}")
    
    # 3. 内存布局
    print("\n3. 内存布局:")
    
    # 行优先 vs 列优先
    row_major = torch.arange(12).reshape(3, 4)
    col_major = row_major.t().contiguous().t()
    
    print(f"行优先步长: {row_major.stride()}")
    print(f"列优先步长: {col_major.stride()}")
    print(f"行优先连续: {row_major.is_contiguous()}")
    print(f"列优先连续: {col_major.is_contiguous()}")

# 运行内存模型演示
tensor_memory_model()
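
上面的输出提到了步长(stride)和存储偏移(storage_offset)。作为补充,下面给出一个最小示例,验证元素 x[i, j] 在底层一维存储中的位置满足 storage_offset + i*stride[0] + j*stride[1](示例假设 x 是连续张量,变量名仅为演示):

import torch

x = torch.arange(12).reshape(3, 4)
i, j = 1, 2
# 按步长公式手动计算 x[i, j] 在一维存储中的偏移
offset = x.storage_offset() + i * x.stride(0) + j * x.stride(1)
print(x[i, j].item())              # 直接索引得到的值
print(x.flatten()[offset].item())  # 从扁平化存储中按偏移取出的值,应与上面相等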

2.1.2 张量数据类型

def tensor_data_types():
    """张量数据类型详解"""
    print("\n=== 张量数据类型 ===")
    
    # 1. 基本数据类型
    print("1. 基本数据类型:")
    data_types = {
        torch.float32: "32位浮点数 (默认)",
        torch.float64: "64位浮点数",
        torch.float16: "16位浮点数",
        torch.int32: "32位整数",
        torch.int64: "64位整数 (默认)",
        torch.int8: "8位整数",
        torch.uint8: "8位无符号整数",
        torch.bool: "布尔类型"
    }
    
    for dtype, description in data_types.items():
        x = torch.tensor([1.0], dtype=dtype)
        print(f"{str(dtype):20} - {description:20} - 内存: {x.element_size()} bytes")
    
    # 2. 类型转换
    print("\n2. 类型转换:")
    x = torch.randn(3, 3)
    print(f"原始类型: {x.dtype}")
    
    # 不同转换方法
    x_int = x.int()
    x_long = x.long()
    x_float = x.float()
    x_double = x.double()
    x_half = x.half()
    
    print(f"转换为int: {x_int.dtype}")
    print(f"转换为long: {x_long.dtype}")
    print(f"转换为float: {x_float.dtype}")
    print(f"转换为double: {x_double.dtype}")
    print(f"转换为half: {x_half.dtype}")
    
    # 3. 类型提升
    print("\n3. 自动类型提升:")
    a = torch.tensor([1], dtype=torch.int32)
    b = torch.tensor([2.0], dtype=torch.float32)
    c = a + b
    print(f"int32 + float32 = {c.dtype}")
    
    # 4. 设备和类型
    print("\n4. 设备和类型:")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    x_cpu = torch.randn(3, 3, dtype=torch.float32)
    x_converted = x_cpu.to(device=device, dtype=torch.float16)  # 无GPU时device为cpu
    
    print(f"原始张量: {x_cpu.device}, {x_cpu.dtype}")
    print(f"转换后张量: {x_converted.device}, {x_converted.dtype}")

# 运行数据类型演示
tensor_data_types()
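
类型提升的结果也可以在运算之前预先查询。下面是一个简短的补充示例,用 torch.result_type 和 torch.promote_types 查看提升规则(示例数值仅作演示):

import torch

a = torch.tensor([1], dtype=torch.int32)
b = torch.tensor([2.0], dtype=torch.float16)
print(torch.result_type(a, b))                          # 两个张量运算后的 dtype: float16
print(torch.promote_types(torch.int64, torch.float32))  # 两种 dtype 提升后的结果: float32
print((a + b).dtype)                                    # 实际运算结果与 result_type 一致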

2.1.3 张量创建方法

def tensor_creation_methods():
    """张量创建方法大全"""
    print("\n=== 张量创建方法 ===")
    
    # 1. 从数据创建
    print("1. 从数据创建:")
    
    # 从列表创建
    list_tensor = torch.tensor([[1, 2], [3, 4]])
    print(f"从列表: \n{list_tensor}")
    
    # 从NumPy数组创建
    np_array = np.array([[1, 2], [3, 4]])
    np_tensor = torch.from_numpy(np_array)
    print(f"从NumPy: \n{np_tensor}")
    
    # 从另一个张量创建
    like_tensor = torch.zeros_like(list_tensor)
    print(f"zeros_like: \n{like_tensor}")
    
    # 2. 特殊值张量
    print("\n2. 特殊值张量:")
    
    zeros = torch.zeros(2, 3)
    ones = torch.ones(2, 3)
    full = torch.full((2, 3), 7)
    eye = torch.eye(3)
    
    print(f"zeros: \n{zeros}")
    print(f"ones: \n{ones}")
    print(f"full: \n{full}")
    print(f"eye: \n{eye}")
    
    # 3. 随机张量
    print("\n3. 随机张量:")
    
    # 设置随机种子
    torch.manual_seed(42)
    
    rand = torch.rand(2, 3)  # [0, 1) 均匀分布
    randn = torch.randn(2, 3)  # 标准正态分布
    randint = torch.randint(0, 10, (2, 3))  # 随机整数
    randperm = torch.randperm(10)  # 随机排列
    
    print(f"rand: \n{rand}")
    print(f"randn: \n{randn}")
    print(f"randint: \n{randint}")
    print(f"randperm: {randperm}")
    
    # 4. 序列张量
    print("\n4. 序列张量:")
    
    arange = torch.arange(0, 10, 2)
    linspace = torch.linspace(0, 1, 5)
    logspace = torch.logspace(0, 2, 5)
    
    print(f"arange: {arange}")
    print(f"linspace: {linspace}")
    print(f"logspace: {logspace}")
    
    # 5. 高级创建方法
    print("\n5. 高级创建方法:")
    
    # 网格张量
    x = torch.linspace(-1, 1, 3)
    y = torch.linspace(-1, 1, 3)
    grid_x, grid_y = torch.meshgrid(x, y, indexing='ij')
    
    print(f"meshgrid x: \n{grid_x}")
    print(f"meshgrid y: \n{grid_y}")
    
    # 对角张量
    diag = torch.diag(torch.tensor([1, 2, 3]))
    print(f"diag: \n{diag}")

# 运行张量创建演示
tensor_creation_methods()
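
补充说明:创建张量时可以在工厂函数里直接指定 dtype 和 device,比创建后再转换更高效;torch.empty 只分配内存、不做初始化。下面是一个简短示意(设备选择逻辑为演示假设):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = torch.zeros(2, 3, dtype=torch.float16, device=device)  # 一步到位指定类型和设备
print(x.dtype, x.device)

buf = torch.empty(2, 3)   # 只分配内存,内容是未初始化的任意值
buf.fill_(0.5)            # 适合随后立即被完整写入的场景
print(buf)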

2.2 张量操作详解

2.2.1 形状变换操作

def tensor_shape_operations():
    """张量形状变换操作"""
    print("\n=== 张量形状变换 ===")
    
    # 创建示例张量
    x = torch.arange(24).reshape(2, 3, 4)
    print(f"原始张量形状: {x.shape}")
    print(f"原始张量: \n{x}")
    
    # 1. 基本变换
    print("\n1. 基本变换:")
    
    # reshape vs view
    reshaped = x.reshape(6, 4)
    viewed = x.view(6, 4)
    print(f"reshape: {reshaped.shape}")
    print(f"view: {viewed.shape}")
    print(f"连续性: reshape={reshaped.is_contiguous()}, view={viewed.is_contiguous()}")
    
    # flatten
    flattened = x.flatten()
    print(f"flatten: {flattened.shape}")
    
    # squeeze 和 unsqueeze
    y = torch.randn(1, 3, 1, 4)
    squeezed = y.squeeze()  # 移除大小为1的维度
    unsqueezed = squeezed.unsqueeze(0)  # 添加维度
    
    print(f"原始: {y.shape}")
    print(f"squeeze: {squeezed.shape}")
    print(f"unsqueeze: {unsqueezed.shape}")
    
    # 2. 维度操作
    print("\n2. 维度操作:")
    
    # transpose
    transposed = x.transpose(0, 2)
    print(f"transpose(0,2): {transposed.shape}")
    
    # permute
    permuted = x.permute(2, 0, 1)
    print(f"permute(2,0,1): {permuted.shape}")
    
    # 矩阵转置
    matrix = torch.randn(3, 4)
    print(f"矩阵转置: {matrix.shape} -> {matrix.t().shape}")
    
    # 3. 高级变换
    print("\n3. 高级变换:")
    
    # chunk - 分块
    chunks = x.chunk(2, dim=0)
    print(f"chunk成{len(chunks)}块,每块形状: {[c.shape for c in chunks]}")
    
    # split - 分割
    splits = x.split([1, 1], dim=0)
    print(f"split成{len(splits)}部分,每部分形状: {[s.shape for s in splits]}")
    
    # repeat - 重复
    repeated = torch.tensor([1, 2]).repeat(2, 3)
    print(f"repeat: {repeated}")
    
    # expand - 扩展
    small = torch.tensor([[1], [2]])
    expanded = small.expand(2, 3)
    print(f"expand: \n{expanded}")
    print(f"expand共享内存: {small.storage().data_ptr() == expanded.storage().data_ptr()}")

# 运行形状变换演示
tensor_shape_operations()
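
view 和 reshape 的一个关键区别是:view 要求张量在内存中连续,而 reshape 在必要时会自动拷贝。下面的小例子(非原文代码)演示了这一点:

import torch

x = torch.arange(12).reshape(3, 4)
t = x.t()                      # 转置后不再连续
print(t.is_contiguous())       # False

try:
    t.view(12)                 # 对非连续张量调用 view 会抛出 RuntimeError
except RuntimeError as e:
    print(f"view 失败: {e}")

y = t.reshape(12)              # reshape 在需要时自动拷贝,总能成功
print(y.shape)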

2.2.2 索引和切片

def tensor_indexing_slicing():
    """张量索引和切片操作"""
    print("\n=== 张量索引和切片 ===")
    
    # 创建示例张量
    x = torch.arange(24).reshape(4, 6)
    print(f"原始张量: \n{x}")
    
    # 1. 基本索引
    print("\n1. 基本索引:")
    print(f"x[0]: {x[0]}")  # 第一行
    print(f"x[:, 0]: {x[:, 0]}")  # 第一列
    print(f"x[1, 3]: {x[1, 3]}")  # 单个元素
    print(f"x[1:3, 2:5]: \n{x[1:3, 2:5]}")  # 切片
    
    # 2. 高级索引
    print("\n2. 高级索引:")
    
    # 布尔索引
    mask = x > 10
    print(f"mask (x > 10): \n{mask}")
    print(f"x[mask]: {x[mask]}")
    
    # 整数数组索引
    rows = torch.tensor([0, 2, 3])
    cols = torch.tensor([1, 3, 5])
    print(f"x[rows, cols]: {x[rows, cols]}")
    
    # 3. 花式索引
    print("\n3. 花式索引:")
    
    # gather - 按索引收集
    indices = torch.tensor([[0, 1, 2], [3, 4, 5]])
    gathered = x.gather(1, indices)
    print(f"gather结果: \n{gathered}")
    
    # scatter - 按索引分散
    y = torch.zeros(4, 6)
    y.scatter_(1, indices, 100)
    print(f"scatter结果: \n{y}")
    
    # index_select - 按索引选择
    selected = x.index_select(0, torch.tensor([0, 2]))
    print(f"index_select结果: \n{selected}")
    
    # 4. 条件索引
    print("\n4. 条件索引:")
    
    # where - 条件选择
    condition = x > 15
    result = torch.where(condition, x, torch.zeros_like(x))
    print(f"where结果: \n{result}")
    
    # masked_select - 掩码选择
    masked = x.masked_select(x > 15)
    print(f"masked_select结果: {masked}")
    
    # nonzero - 非零元素索引
    nonzero_indices = (x > 15).nonzero()
    print(f"nonzero索引: \n{nonzero_indices}")

# 运行索引切片演示
tensor_indexing_slicing()
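
索引不仅可以读取数据,也可以用于原地赋值。下面补充一个简短示例(数值仅作演示):

import torch

x = torch.arange(24).reshape(4, 6)
x[x > 20] = 0                              # 布尔掩码赋值
x[0, :] = -1                               # 切片赋值
x.index_fill_(0, torch.tensor([2]), 99)    # 将第2行整行填充为99
print(x)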

2.2.3 广播机制

def broadcasting_mechanism():
    """广播机制详解"""
    print("\n=== 广播机制 ===")
    
    # 1. 广播规则
    print("1. 广播规则:")
    print("- 从右向左比较维度")
    print("- 维度大小相等或其中一个为1")
    print("- 缺失维度视为1")
    
    # 2. 广播示例
    print("\n2. 广播示例:")
    
    # 标量与张量
    a = torch.tensor([[1, 2, 3], [4, 5, 6]])
    b = 10
    result1 = a + b
    print(f"张量 + 标量:")
    print(f"a: {a.shape} \n{a}")
    print(f"b: 标量 {b}")
    print(f"结果: \n{result1}")
    
    # 不同形状张量
    c = torch.tensor([[1], [2]])  # (2, 1)
    d = torch.tensor([10, 20, 30])  # (3,)
    result2 = c + d
    print(f"\n不同形状张量:")
    print(f"c: {c.shape} \n{c}")
    print(f"d: {d.shape} {d}")
    print(f"结果: {result2.shape} \n{result2}")
    
    # 3. 广播可视化
    print("\n3. 广播过程可视化:")
    
    def visualize_broadcast(x, y):
        print(f"原始形状: x{x.shape}, y{y.shape}")
        
        # 检查是否可以广播
        try:
            result = x + y
            print(f"广播后形状: {result.shape}")
            print(f"广播成功 ✓")
            return result
        except RuntimeError as e:
            print(f"广播失败 ✗: {e}")
            return None
    
    # 可广播的例子
    x1 = torch.randn(3, 1, 4)
    y1 = torch.randn(1, 2, 1)
    visualize_broadcast(x1, y1)
    
    # 不可广播的例子
    x2 = torch.randn(3, 4)
    y2 = torch.randn(2, 3)
    visualize_broadcast(x2, y2)
    
    # 4. 手动广播
    print("\n4. 手动广播:")
    
    a = torch.tensor([[1, 2]])  # (1, 2)
    b = torch.tensor([[3], [4]])  # (2, 1)
    
    # 使用expand手动广播
    a_expanded = a.expand(2, 2)
    b_expanded = b.expand(2, 2)
    
    print(f"a原始: {a.shape} \n{a}")
    print(f"a扩展: {a_expanded.shape} \n{a_expanded}")
    print(f"b原始: {b.shape} \n{b}")
    print(f"b扩展: {b_expanded.shape} \n{b_expanded}")
    print(f"手动广播结果: \n{a_expanded + b_expanded}")
    
    # 5. 广播性能考虑
    print("\n5. 广播性能:")
    
    import time
    
    # 大张量广播测试
    large_a = torch.randn(1000, 1)
    large_b = torch.randn(1, 1000)
    
    start = time.time()
    for _ in range(100):
        result = large_a + large_b
    broadcast_time = time.time() - start
    
    # 预先扩展
    expanded_a = large_a.expand(1000, 1000)
    expanded_b = large_b.expand(1000, 1000)
    
    start = time.time()
    for _ in range(100):
        result = expanded_a + expanded_b
    expanded_time = time.time() - start
    
    print(f"广播时间: {broadcast_time:.4f}s")
    print(f"预扩展时间: {expanded_time:.4f}s")

# 运行广播机制演示
broadcasting_mechanism()
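
如果只想预测广播后的形状而不做实际运算,可以用 torch.broadcast_shapes;torch.broadcast_to 则显式地把张量广播成目标形状(返回视图,不复制数据)。下面是一个补充小例:

import torch

print(torch.broadcast_shapes((3, 1, 4), (1, 2, 1)))   # torch.Size([3, 2, 4])

a = torch.tensor([[1], [2]])                           # (2, 1)
print(torch.broadcast_to(a, (2, 3)))                   # 显式广播为 (2, 3) 的视图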

2.3 数学运算

2.3.1 基本数学运算

def basic_math_operations():
    """基本数学运算"""
    print("\n=== 基本数学运算 ===")
    
    # 创建示例张量
    a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    b = torch.tensor([[5.0, 6.0], [7.0, 8.0]])
    
    print(f"a: \n{a}")
    print(f"b: \n{b}")
    
    # 1. 算术运算
    print("\n1. 算术运算:")
    print(f"加法 a + b: \n{a + b}")
    print(f"减法 a - b: \n{a - b}")
    print(f"乘法 a * b: \n{a * b}")
    print(f"除法 a / b: \n{a / b}")
    print(f"幂运算 a ** 2: \n{a ** 2}")
    print(f"取模 a % 3: \n{a % 3}")
    
    # 2. 就地运算
    print("\n2. 就地运算:")
    c = a.clone()
    print(f"原始 c: \n{c}")
    c.add_(1)  # 就地加法
    print(f"c.add_(1): \n{c}")
    c.mul_(2)  # 就地乘法
    print(f"c.mul_(2): \n{c}")
    
    # 3. 比较运算
    print("\n3. 比较运算:")
    print(f"a > 2: \n{a > 2}")
    print(f"a == b: \n{a == b}")
    print(f"torch.eq(a, 2): \n{torch.eq(a, 2)}")
    print(f"torch.gt(a, 2): \n{torch.gt(a, 2)}")
    
    # 4. 逻辑运算
    print("\n4. 逻辑运算:")
    mask1 = a > 2
    mask2 = b < 7
    print(f"mask1 (a > 2): \n{mask1}")
    print(f"mask2 (b < 7): \n{mask2}")
    print(f"mask1 & mask2: \n{mask1 & mask2}")
    print(f"mask1 | mask2: \n{mask1 | mask2}")
    print(f"~mask1: \n{~mask1}")

# 运行基本数学运算演示
basic_math_operations()
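
除了算术与比较运算,PyTorch 还提供了大量逐元素数学函数。下面补充几个常用函数的最小示例:

import torch

a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
print(torch.sqrt(a))              # 平方根
print(torch.exp(a))               # 指数
print(torch.log(a))               # 自然对数
print(torch.clamp(a, 1.5, 3.5))   # 截断到区间 [1.5, 3.5]
print(torch.abs(a - 2.5))         # 绝对值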

2.3.2 线性代数运算

def linear_algebra_operations():
    """线性代数运算"""
    print("\n=== 线性代数运算 ===")
    
    # 1. 矩阵乘法
    print("1. 矩阵乘法:")
    A = torch.randn(3, 4)
    B = torch.randn(4, 5)
    
    # 不同的矩阵乘法方式
    C1 = torch.mm(A, B)  # 2D矩阵乘法
    C2 = A @ B  # 运算符重载
    C3 = torch.matmul(A, B)  # 通用矩阵乘法
    
    print(f"A: {A.shape}")
    print(f"B: {B.shape}")
    print(f"C: {C1.shape}")
    print(f"结果一致: {torch.allclose(C1, C2) and torch.allclose(C2, C3)}")
    
    # 批量矩阵乘法
    batch_A = torch.randn(10, 3, 4)
    batch_B = torch.randn(10, 4, 5)
    batch_C = torch.bmm(batch_A, batch_B)
    print(f"批量矩阵乘法: {batch_A.shape} @ {batch_B.shape} = {batch_C.shape}")
    
    # 2. 向量运算
    print("\n2. 向量运算:")
    u = torch.tensor([1.0, 2.0, 3.0])
    v = torch.tensor([4.0, 5.0, 6.0])
    
    # 点积
    dot_product = torch.dot(u, v)
    print(f"点积 u·v: {dot_product}")
    
    # 外积
    outer_product = torch.outer(u, v)
    print(f"外积 u⊗v: \n{outer_product}")
    
    # 叉积 (3D向量)
    cross_product = torch.linalg.cross(u, v)  # torch.cross 不指定 dim 的用法已不推荐
    print(f"叉积 u×v: {cross_product}")
    
    # 3. 矩阵分解
    print("\n3. 矩阵分解:")
    
    # 创建对称正定矩阵
    X = torch.randn(4, 4)
    A = X @ X.t() + torch.eye(4)  # 确保正定
    
    # 特征值分解
    eigenvals, eigenvecs = torch.linalg.eig(A)
    print(f"特征值: {eigenvals.real}")
    
    # SVD分解
    U, S, Vh = torch.linalg.svd(A)
    print(f"SVD: U{U.shape}, S{S.shape}, Vh{Vh.shape}")
    
    # Cholesky分解
    L = torch.linalg.cholesky(A)
    print(f"Cholesky分解验证: {torch.allclose(L @ L.t(), A)}")
    
    # QR分解
    Q, R = torch.linalg.qr(A)
    print(f"QR分解验证: {torch.allclose(Q @ R, A)}")
    
    # 4. 矩阵性质
    print("\n4. 矩阵性质:")
    
    # 行列式
    det = torch.linalg.det(A)
    print(f"行列式: {det}")
    
    # 矩阵的秩
    rank = torch.linalg.matrix_rank(A)
    print(f"矩阵的秩: {rank}")
    
    # 条件数
    cond = torch.linalg.cond(A)
    print(f"条件数: {cond}")
    
    # 矩阵范数
    frobenius_norm = torch.linalg.matrix_norm(A, 'fro')
    spectral_norm = torch.linalg.matrix_norm(A, 2)
    print(f"Frobenius范数: {frobenius_norm}")
    print(f"谱范数: {spectral_norm}")
    
    # 5. 求解线性方程组
    print("\n5. 线性方程组求解:")
    
    # Ax = b
    b = torch.randn(4)
    x = torch.linalg.solve(A, b)
    print(f"方程组解验证: {torch.allclose(A @ x, b)}")
    
    # 最小二乘解
    A_rect = torch.randn(6, 4)  # 超定系统
    b_rect = torch.randn(6)
    x_lstsq = torch.linalg.lstsq(A_rect, b_rect).solution
    print(f"最小二乘解形状: {x_lstsq.shape}")

# 运行线性代数运算演示
linear_algebra_operations()
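
上面的矩阵乘法、点积、外积也都可以用 torch.einsum 的下标表达式统一表示。下面是一个补充示例(张量形状仅作演示):

import torch

A = torch.randn(3, 4)
B = torch.randn(4, 5)
u = torch.randn(3)
v = torch.randn(3)

print(torch.allclose(torch.einsum('ik,kj->ij', A, B), A @ B))        # 矩阵乘法
print(torch.allclose(torch.einsum('i,i->', u, v), torch.dot(u, v)))  # 点积
print(torch.einsum('i,j->ij', u, v).shape)                           # 外积,形状 (3, 3)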

2.3.3 统计和聚合运算

def statistical_operations():
    """统计和聚合运算"""
    print("\n=== 统计和聚合运算 ===")
    
    # 创建示例数据
    torch.manual_seed(42)
    data = torch.randn(4, 5)
    print(f"数据: \n{data}")
    
    # 1. 基本统计量
    print("\n1. 基本统计量:")
    print(f"均值: {data.mean()}")
    print(f"标准差: {data.std()}")
    print(f"方差: {data.var()}")
    print(f"最小值: {data.min()}")
    print(f"最大值: {data.max()}")
    print(f"中位数: {data.median()}")
    
    # 按维度统计
    print(f"\n按行统计 (dim=1):")
    print(f"行均值: {data.mean(dim=1)}")
    print(f"行标准差: {data.std(dim=1)}")
    
    print(f"\n按列统计 (dim=0):")
    print(f"列均值: {data.mean(dim=0)}")
    print(f"列标准差: {data.std(dim=0)}")
    
    # 2. 聚合运算
    print("\n2. 聚合运算:")
    print(f"求和: {data.sum()}")
    print(f"乘积: {data.prod()}")
    print(f"累积和: {data.cumsum(dim=0)}")
    print(f"累积乘积: {data.cumprod(dim=1)}")
    
    # 3. 排序和排名
    print("\n3. 排序和排名:")
    
    # 排序
    sorted_data, indices = data.sort(dim=1)
    print(f"按行排序: \n{sorted_data}")
    print(f"排序索引: \n{indices}")
    
    # topk
    values, indices = data.topk(3, dim=1)
    print(f"每行top3值: \n{values}")
    print(f"每行top3索引: \n{indices}")
    
    # 4. 分位数
    print("\n4. 分位数:")
    print(f"25%分位数: {data.quantile(0.25)}")
    print(f"50%分位数: {data.quantile(0.5)}")
    print(f"75%分位数: {data.quantile(0.75)}")
    
    # 5. 直方图
    print("\n5. 直方图:")
    hist = torch.histc(data.flatten(), bins=10, min=-3, max=3)
    print(f"直方图: {hist}")
    
    # 6. 相关性分析
    print("\n6. 相关性分析:")
    
    # 协方差矩阵
    cov_matrix = torch.cov(data.t())  # 转置后计算列之间的协方差
    print(f"协方差矩阵: \n{cov_matrix}")
    
    # 相关系数矩阵
    corr_matrix = torch.corrcoef(data.t())
    print(f"相关系数矩阵: \n{corr_matrix}")

# 运行统计运算演示
statistical_operations()
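
处理离散数据(如类别标签)时,还有几个常用的统计辅助函数。下面是一个简短的补充示例(标签数据为演示假设):

import torch

labels = torch.tensor([0, 2, 1, 2, 2, 0])
print(torch.unique(labels))      # 去重: tensor([0, 1, 2])
print(torch.bincount(labels))    # 各取值出现次数: tensor([2, 1, 3])

data = torch.randn(4, 5)
print(data.argmax(dim=1))        # 每行最大值所在的列索引
print(data.argmin(dim=0))        # 每列最小值所在的行索引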

2.4 自动微分系统

2.4.1 自动微分基础

def autograd_basics():
    """自动微分基础"""
    print("\n=== 自动微分基础 ===")
    
    # 1. 启用梯度追踪
    print("1. 梯度追踪:")
    
    # 创建需要梯度的张量
    x = torch.tensor([2.0], requires_grad=True)
    y = torch.tensor([3.0], requires_grad=True)
    
    print(f"x: {x}, requires_grad: {x.requires_grad}")
    print(f"y: {y}, requires_grad: {y.requires_grad}")
    
    # 计算函数值
    z = x**2 + 2*x*y + y**2
    print(f"z = x² + 2xy + y² = {z}")
    
    # 反向传播
    z.backward()
    
    print(f"∂z/∂x = {x.grad}")  # 应该是 2x + 2y = 2*2 + 2*3 = 10
    print(f"∂z/∂y = {y.grad}")  # 应该是 2x + 2y = 2*2 + 2*3 = 10
    
    # 2. 梯度累积
    print("\n2. 梯度累积:")
    
    x.grad.zero_()  # 清零梯度
    y.grad.zero_()
    
    # 第一次计算
    z1 = x**2
    z1.backward()
    print(f"第一次: ∂(x²)/∂x = {x.grad}")
    
    # 第二次计算(不清零梯度)
    z2 = 3*x
    z2.backward()
    print(f"累积后: ∂(x² + 3x)/∂x = {x.grad}")
    
    # 3. 计算图可视化
    print("\n3. 计算图:")
    
    x = torch.tensor([1.0], requires_grad=True)
    a = x + 2
    b = a * 3
    c = b**2
    
    print(f"x -> a=x+2 -> b=a*3 -> c=b²")
    print(f"x={x.item()}, a={a.item()}, b={b.item()}, c={c.item()}")
    
    c.backward()
    print(f"∂c/∂x = {x.grad}")  # 链式法则: 2b * 3 * 1 = 2*9*3 = 54

# 运行自动微分基础演示
autograd_basics()
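
需要注意,backward() 只能直接作用于标量输出;如果输出是向量,必须传入一个与输出同形状的 gradient 参数(即向量-雅可比积中的向量)。下面补充一个最小示例:

import torch

x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x ** 2                            # 输出是向量,不能直接 y.backward()

v = torch.ones_like(y)                # 等价于对 y.sum() 做反向传播
y.backward(v)
print(x.grad)                         # 2x = tensor([2., 4., 6.])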

2.4.2 高级自动微分

def advanced_autograd():
    """高级自动微分技术"""
    print("\n=== 高级自动微分 ===")
    
    # 1. 高阶导数
    print("1. 高阶导数:")
    
    x = torch.tensor([2.0], requires_grad=True)
    y = x**4 + 2*x**3 + x**2
    
    # 一阶导数
    grad1 = torch.autograd.grad(y, x, create_graph=True)[0]
    print(f"f(x) = x⁴ + 2x³ + x²")
    print(f"f'(x) = {grad1}")
    
    # 二阶导数
    grad2 = torch.autograd.grad(grad1, x, create_graph=True)[0]
    print(f"f''(x) = {grad2}")
    
    # 三阶导数
    grad3 = torch.autograd.grad(grad2, x)[0]
    print(f"f'''(x) = {grad3}")
    
    # 2. 雅可比矩阵
    print("\n2. 雅可比矩阵:")
    
    def vector_function(x):
        """向量函数 f: R² -> R²"""
        return torch.stack([
            x[0]**2 + x[1],
            x[0] * x[1]**2
        ])
    
    x = torch.tensor([2.0, 3.0], requires_grad=True)
    y = vector_function(x)
    
    # 计算雅可比矩阵
    jacobian = torch.zeros(2, 2)
    for i in range(2):
        grad = torch.autograd.grad(y[i], x, retain_graph=True)[0]
        jacobian[i] = grad
    
    print(f"输入: {x}")
    print(f"输出: {y}")
    print(f"雅可比矩阵: \n{jacobian}")
    
    # 3. 海塞矩阵
    print("\n3. 海塞矩阵:")
    
    def scalar_function(x):
        """标量函数 f: R² -> R"""
        return x[0]**2 * x[1] + x[1]**3
    
    x = torch.tensor([1.0, 2.0], requires_grad=True)
    y = scalar_function(x)
    
    # 计算海塞矩阵
    grad = torch.autograd.grad(y, x, create_graph=True)[0]
    hessian = torch.zeros(2, 2)
    
    for i in range(2):
        grad2 = torch.autograd.grad(grad[i], x, retain_graph=True)[0]
        hessian[i] = grad2
    
    print(f"函数值: {y}")
    print(f"梯度: {grad}")
    print(f"海塞矩阵: \n{hessian}")
    
    # 4. 函数式API
    print("\n4. 函数式API:")
    
    # torch.func.jacrev - 雅可比矩阵(反向模式)
    try:
        import torch.func as func
        
        def f(x):
            return x**2 + torch.sin(x)
        
        x = torch.tensor([1.0, 2.0])
        jac = func.jacrev(f)(x)
        print(f"函数式雅可比: {jac}")
        
    except ImportError:
        print("torch.func 不可用,跳过函数式API演示")

# 运行高级自动微分演示
advanced_autograd()
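
除了手动循环求导,torch.autograd.functional 也提供了现成的 jacobian / hessian 接口,可以替代上面的手写实现。下面是一个简短的补充示例:

import torch
from torch.autograd.functional import jacobian, hessian

def f(x):
    return x[0]**2 * x[1] + x[1]**3

x = torch.tensor([1.0, 2.0])
print(jacobian(f, x))   # 对标量函数即梯度向量
print(hessian(f, x))    # 海塞矩阵,应与上面手动计算的结果一致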

2.4.3 梯度控制技术

def gradient_control():
    """梯度控制技术"""
    print("\n=== 梯度控制技术 ===")
    
    # 1. 梯度上下文管理
    print("1. 梯度上下文管理:")
    
    x = torch.tensor([1.0], requires_grad=True)
    
    # 正常计算(有梯度)
    y1 = x**2
    print(f"正常计算: y1.requires_grad = {y1.requires_grad}")
    
    # 无梯度计算
    with torch.no_grad():
        y2 = x**2
        print(f"no_grad: y2.requires_grad = {y2.requires_grad}")
    
    # 推理模式
    with torch.inference_mode():
        y3 = x**2
        print(f"inference_mode: y3.requires_grad = {y3.requires_grad}")
    
    # 2. 梯度检查点
    print("\n2. 梯度检查点:")
    
    def expensive_function(x):
        """模拟计算昂贵的函数"""
        for _ in range(1000):
            x = torch.sin(x)
        return x
    
    x = torch.tensor([1.0], requires_grad=True)
    
    # 使用检查点节省内存(前向时不保存中间激活,反向时重新计算)
    from torch.utils.checkpoint import checkpoint
    y = checkpoint(expensive_function, x, use_reentrant=False)
    y.backward()
    print(f"检查点梯度: {x.grad}")
    
    # 3. 梯度裁剪
    print("\n3. 梯度裁剪:")
    
    # 创建一个简单模型
    import torch.nn as nn
    
    model = nn.Linear(10, 1)
    x = torch.randn(5, 10)
    y = torch.randn(5, 1)
    
    # 计算损失和梯度
    loss = nn.MSELoss()(model(x), y)
    loss.backward()
    
    # 梯度裁剪前:clip_grad_norm_ 返回的是裁剪前的总梯度范数
    grad_norm_before = torch.nn.utils.clip_grad_norm_(model.parameters(), float('inf'))
    print(f"裁剪前梯度范数: {grad_norm_before}")
    
    # 重新计算损失和梯度(上一次 backward 后计算图已释放,不能直接再次 backward)
    model.zero_grad()
    loss = nn.MSELoss()(model(x), y)
    loss.backward()
    
    # 梯度裁剪:原地缩放梯度,使总范数不超过 max_norm
    max_norm = 1.0
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    grad_norm_after = torch.sqrt(sum(p.grad.norm()**2 for p in model.parameters()))
    print(f"裁剪后梯度范数: {grad_norm_after}")
    
    # 4. 自定义梯度函数
    print("\n4. 自定义梯度函数:")
    
    class CustomFunction(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            # 保存用于反向传播的张量
            ctx.save_for_backward(x)
            return x**2
        
        @staticmethod
        def backward(ctx, grad_output):
            # 获取保存的张量
            x, = ctx.saved_tensors
            # 返回输入的梯度
            return grad_output * 2 * x
    
    # 使用自定义函数
    custom_func = CustomFunction.apply
    x = torch.tensor([3.0], requires_grad=True)
    y = custom_func(x)
    y.backward()
    print(f"自定义函数梯度: {x.grad}")
    
    # 5. 梯度钩子
    print("\n5. 梯度钩子:")
    
    def gradient_hook(grad):
        print(f"梯度钩子捕获: {grad}")
        return grad * 2  # 修改梯度
    
    x = torch.tensor([2.0], requires_grad=True)
    x.register_hook(gradient_hook)
    
    y = x**3
    y.backward()
    print(f"钩子修改后梯度: {x.grad}")

# 运行梯度控制演示
gradient_control()
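
与上面的上下文管理器互补,还有两种常见的"切断梯度"手段:detach() 得到与计算图断开的新张量;requires_grad_(False) 用来冻结参数(例如微调时固定部分层)。下面是补充示例(模型仅作演示):

import torch
import torch.nn as nn

# 1) detach():返回一个不再参与求导的新张量
x = torch.tensor([2.0], requires_grad=True)
y = (x ** 2).detach()
print(y.requires_grad)    # False

# 2) requires_grad_(False):冻结模型参数
model = nn.Linear(10, 1)
for p in model.parameters():
    p.requires_grad_(False)
print(any(p.requires_grad for p in model.parameters()))   # False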

2.5 计算图优化

2.5.1 计算图分析

def computational_graph_analysis():
    """计算图分析"""
    print("\n=== 计算图分析 ===")
    
    # 1. 计算图构建
    print("1. 计算图构建:")
    
    x = torch.tensor([2.0], requires_grad=True)
    y = torch.tensor([3.0], requires_grad=True)
    
    # 构建复杂计算图
    a = x + y
    b = x * y
    c = a / b
    d = torch.sin(c)
    e = d**2
    
    print(f"计算图: x,y -> a=x+y, b=x*y -> c=a/b -> d=sin(c) -> e=d²")
    print(f"最终结果: {e}")
    
    # 2. 计算图可视化信息
    print("\n2. 计算图信息:")
    
    def print_graph_info(tensor, name):
        if tensor.grad_fn is not None:
            print(f"{name}: {tensor.grad_fn}")
            # next_functions 中的每一项是 (Function节点, 输出索引);叶子常量对应 None
            inputs = [fn if fn is not None else 'leaf' for fn, _ in tensor.grad_fn.next_functions]
            print(f"  输入: {inputs}")
    
    print_graph_info(a, "a")
    print_graph_info(b, "b") 
    print_graph_info(c, "c")
    print_graph_info(d, "d")
    print_graph_info(e, "e")
    
    # 3. 叶子节点检查
    print("\n3. 叶子节点:")
    for name, tensor in [("x", x), ("y", y), ("a", a), ("e", e)]:
        print(f"{name}: is_leaf={tensor.is_leaf}, requires_grad={tensor.requires_grad}")
    
    # 4. 计算图释放
    print("\n4. 计算图管理:")
    
    # 保留计算图
    e.backward(retain_graph=True)
    print(f"第一次反向传播: x.grad={x.grad}, y.grad={y.grad}")
    
    # 再次反向传播(需要retain_graph=True)
    x.grad.zero_()
    y.grad.zero_()
    e.backward()
    print(f"第二次反向传播: x.grad={x.grad}, y.grad={y.grad}")

# 运行计算图分析演示
computational_graph_analysis()
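
补充一点:非叶子节点默认不会保留 .grad,反向传播后其梯度会被释放;如果确实需要查看中间节点的梯度,可以调用 retain_grad()。下面是一个最小示例:

import torch

x = torch.tensor([2.0], requires_grad=True)
a = x * 3            # 非叶子节点
a.retain_grad()      # 要求反向传播后保留 a 的梯度
b = a ** 2
b.backward()

print(a.is_leaf, a.grad)    # False, tensor([12.])  (∂b/∂a = 2a)
print(x.is_leaf, x.grad)    # True,  tensor([36.])  (∂b/∂x = 2a·3)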

2.5.2 内存优化技术

def memory_optimization():
    """内存优化技术"""
    print("\n=== 内存优化技术 ===")
    
    # 1. 就地操作
    print("1. 就地操作:")
    
    x = torch.randn(1000, 1000)
    print(f"原始内存地址: {x.data_ptr()}")
    
    # 非就地操作(创建新张量)
    y = x + 1
    print(f"非就地操作地址: {y.data_ptr()}")
    print(f"地址相同: {x.data_ptr() == y.data_ptr()}")
    
    # 就地操作(修改原张量)
    x.add_(1)
    print(f"就地操作后地址: {x.data_ptr()}")
    
    # 2. 内存池管理
    print("\n2. GPU内存管理:")
    
    if torch.cuda.is_available():
        device = torch.device("cuda")
        
        # 清空缓存
        torch.cuda.empty_cache()
        print(f"清空后GPU内存: {torch.cuda.memory_allocated() / 1e6:.1f} MB")
        
        # 分配大张量
        large_tensor = torch.randn(1000, 1000, device=device)
        print(f"分配后GPU内存: {torch.cuda.memory_allocated() / 1e6:.1f} MB")
        
        # 删除张量
        del large_tensor
        print(f"删除后GPU内存: {torch.cuda.memory_allocated() / 1e6:.1f} MB")
        
        # 清空缓存
        torch.cuda.empty_cache()
        print(f"清空缓存后GPU内存: {torch.cuda.memory_allocated() / 1e6:.1f} MB")
    
    # 3. 梯度累积
    print("\n3. 梯度累积:")
    
    # 模拟大批量训练
    model = torch.nn.Linear(100, 10)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    
    # 完整训练数据(模拟一个大批量)
    large_batch = torch.randn(1000, 100)
    large_target = torch.randn(1000, 10)
    
    # 梯度累积方法(小批量)
    accumulation_steps = 10
    small_batch_size = 100
    
    optimizer.zero_grad()
    for i in range(accumulation_steps):
        start_idx = i * small_batch_size
        end_idx = start_idx + small_batch_size
        
        small_batch = large_batch[start_idx:end_idx]
        small_target = large_target[start_idx:end_idx]
        
        output = model(small_batch)
        loss = criterion(output, small_target)
        loss = loss / accumulation_steps  # 平均损失
        loss.backward()
    
    optimizer.step()
    print("梯度累积完成")
    
    # 4. 混合精度训练
    print("\n4. 混合精度训练:")
    
    if torch.cuda.is_available():
        # 自动混合精度
        scaler = torch.cuda.amp.GradScaler()
        
        model = model.cuda()
        x = torch.randn(32, 100, device='cuda')
        target = torch.randn(32, 10, device='cuda')
        
        optimizer.zero_grad()
        
        with torch.cuda.amp.autocast():
            output = model(x)
            loss = criterion(output, target)
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        print("混合精度训练完成")

# 运行内存优化演示
memory_optimization()
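
衡量一段代码的显存占用时,峰值统计往往比瞬时值更有参考意义。下面是一个补充示例(张量规模仅作演示,需要可用的GPU):

import torch

if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
    tmp = torch.randn(2000, 2000, device="cuda")
    result = tmp @ tmp
    print(f"峰值显存: {torch.cuda.max_memory_allocated() / 1e6:.1f} MB")
    del tmp, result
    torch.cuda.empty_cache()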

2.6 实践案例

2.6.1 自定义激活函数

def custom_activation_function():
    """自定义激活函数案例"""
    print("\n=== 自定义激活函数 ===")
    
    # 1. 使用Function类定义
    class Swish(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            # Swish: x * sigmoid(x)
            sigmoid_x = torch.sigmoid(x)
            output = x * sigmoid_x
            
            # 保存用于反向传播的值
            ctx.save_for_backward(x, sigmoid_x, output)
            return output
        
        @staticmethod
        def backward(ctx, grad_output):
            # 获取保存的值
            x, sigmoid_x, output = ctx.saved_tensors
            
            # 计算导数: d/dx[x * sigmoid(x)] = sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))
            grad_input = sigmoid_x + output * (1 - sigmoid_x)
            
            return grad_output * grad_input
    
    # 2. 测试自定义激活函数
    swish = Swish.apply
    
    x = torch.linspace(-3, 3, 100, requires_grad=True)
    y = swish(x)
    
    # 计算梯度
    grad_outputs = torch.ones_like(y)
    x_grad = torch.autograd.grad(y, x, grad_outputs)[0]
    
    # 3. 与PyTorch内置版本比较
    y_builtin = x * torch.sigmoid(x)
    x_grad_builtin = torch.autograd.grad(y_builtin, x, grad_outputs)[0]
    
    print(f"自定义版本与内置版本一致: {torch.allclose(y, y_builtin)}")
    print(f"梯度一致: {torch.allclose(x_grad, x_grad_builtin)}")
    
    # 4. 可视化
    plt.figure(figsize=(12, 4))
    
    plt.subplot(1, 2, 1)
    plt.plot(x.detach().numpy(), y.detach().numpy(), label='Swish')
    plt.plot(x.detach().numpy(), torch.relu(x).detach().numpy(), label='ReLU')
    plt.plot(x.detach().numpy(), torch.tanh(x).detach().numpy(), label='Tanh')
    plt.title('Activation Functions')
    plt.xlabel('x')
    plt.ylabel('f(x)')
    plt.legend()
    plt.grid(True)
    
    plt.subplot(1, 2, 2)
    plt.plot(x.detach().numpy(), x_grad.detach().numpy(), label="Swish'")
    plt.title('Derivatives')
    plt.xlabel('x')
    plt.ylabel("f'(x)")
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()

# 运行自定义激活函数演示
custom_activation_function()

2.6.2 数值优化问题

def numerical_optimization():
    """数值优化问题案例"""
    print("\n=== 数值优化问题 ===")
    
    # 1. 函数最小化
    print("1. 函数最小化:")
    
    # 目标函数: f(x, y) = (x-1)² + (y-2)² + sin(x*y)
    def objective_function(params):
        x, y = params
        return (x - 1)**2 + (y - 2)**2 + torch.sin(x * y)
    
    # 初始化参数
    params = torch.tensor([0.0, 0.0], requires_grad=True)
    optimizer = torch.optim.Adam([params], lr=0.1)
    
    # 优化过程
    history = []
    for i in range(100):
        optimizer.zero_grad()
        loss = objective_function(params)
        loss.backward()
        optimizer.step()
        
        history.append(loss.item())
        
        if i % 20 == 0:
            print(f"Step {i}: loss={loss.item():.6f}, params={params.detach().numpy()}")
    
    print(f"最终结果: x={params[0].item():.4f}, y={params[1].item():.4f}")
    
    # 2. 约束优化
    print("\n2. 约束优化:")
    
    # 目标: 最小化 x² + y² 约束于 x + y = 1
    def constrained_objective(params):
        x, y = params
        objective = x**2 + y**2
        constraint = (x + y - 1)**2  # 惩罚项
        return objective + 10 * constraint  # 拉格朗日乘数法的简化版本
    
    params = torch.tensor([0.0, 0.0], requires_grad=True)
    optimizer = torch.optim.LBFGS([params])
    
    def closure():
        optimizer.zero_grad()
        loss = constrained_objective(params)
        loss.backward()
        return loss
    
    for i in range(10):
        loss = optimizer.step(closure)
        if i % 2 == 0:
            x, y = params.detach().numpy()
            print(f"Step {i}: x={x:.4f}, y={y:.4f}, constraint={x+y:.4f}")
    
    # 3. 多变量函数拟合
    print("\n3. 多变量函数拟合:")
    
    # 生成数据: z = sin(x) * cos(y) + noise
    torch.manual_seed(42)
    n_points = 100
    x_data = torch.rand(n_points) * 4 - 2  # [-2, 2]
    y_data = torch.rand(n_points) * 4 - 2  # [-2, 2]
    z_true = torch.sin(x_data) * torch.cos(y_data)
    z_data = z_true + 0.1 * torch.randn(n_points)  # 添加噪声
    
    # 定义拟合函数: z = a*sin(b*x) * c*cos(d*y) + e
    params = torch.tensor([1.0, 1.0, 1.0, 1.0, 0.0], requires_grad=True)
    
    def fitted_function(x, y, params):
        a, b, c, d, e = params
        return a * torch.sin(b * x) * c * torch.cos(d * y) + e
    
    optimizer = torch.optim.Adam([params], lr=0.01)
    
    for i in range(1000):
        optimizer.zero_grad()
        z_pred = fitted_function(x_data, y_data, params)
        loss = torch.mean((z_pred - z_data)**2)
        loss.backward()
        optimizer.step()
        
        if i % 200 == 0:
            print(f"Epoch {i}: MSE={loss.item():.6f}")
    
    print(f"拟合参数: {params.detach().numpy()}")
    print(f"真实参数: [1, 1, 1, 1, 0]")

# 运行数值优化演示
numerical_optimization()
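
上面的例子都借助了 torch.optim。作为补充,下面给出一个不使用优化器、直接基于 autograd 手写梯度下降的最小示例(目标函数为演示假设):

import torch

# 最小化 f(x) = (x - 3)²
x = torch.tensor([0.0], requires_grad=True)
lr = 0.1
for _ in range(50):
    loss = (x - 3) ** 2
    loss.backward()
    with torch.no_grad():      # 参数更新不应记录进计算图
        x -= lr * x.grad
    x.grad.zero_()
print(x)                       # 应接近 3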

2.7 本章总结

2.7.1 关键概念回顾

def chapter_summary():
    """第2章关键概念总结"""
    print("\n📚 第2章总结:张量操作与自动微分")
    print("=" * 50)
    
    key_concepts = {
        "张量基础": [
            "内存模型和存储结构",
            "数据类型和设备管理",
            "视图vs拷贝机制",
            "张量创建的多种方法"
        ],
        "张量操作": [
            "形状变换和维度操作",
            "索引、切片和花式索引",
            "广播机制和自动扩展",
            "数学运算和线性代数"
        ],
        "自动微分": [
            "计算图的构建和管理",
            "梯度计算和反向传播",
            "高阶导数和雅可比矩阵",
            "梯度控制和优化技术"
        ],
        "性能优化": [
            "内存管理和就地操作",
            "梯度累积和混合精度",
            "计算图优化策略",
            "自定义函数和钩子"
        ]
    }
    
    for category, items in key_concepts.items():
        print(f"\n{category}:")
        for item in items:
            print(f"  • {item}")
    
    print(f"\n🎯 学习成果:")
    achievements = [
        "✅ 深入理解张量的内存模型和数据结构",
        "✅ 掌握各种张量操作和变换技巧",
        "✅ 理解广播机制和高级索引方法",
        "✅ 掌握自动微分系统的工作原理",
        "✅ 学会梯度计算和反向传播实现",
        "✅ 了解计算图优化和内存管理技术"
    ]
    
    for achievement in achievements:
        print(f"  {achievement}")

# 显示章节总结
chapter_summary()

2.7.2 下一章预告

def next_chapter_preview():
    """下一章内容预告"""
    print(f"\n🔮 下一章预告:神经网络基础")
    print("=" * 40)
    
    preview = [
        "🧠 nn.Module类的设计原理",
        "🔧 神经网络层的定义和使用",
        "➡️ 前向传播的实现机制",
        "🔄 参数初始化和管理",
        "📊 损失函数和激活函数",
        "🏗️ 构建复杂网络架构"
    ]
    
    for item in preview:
        print(f"  {item}")
    
    print(f"\n💡 学习建议:")
    suggestions = [
        "熟练掌握张量的各种操作",
        "理解自动微分的工作原理",
        "多练习梯度计算和反向传播",
        "准备线性代数和微积分基础"
    ]
    
    for suggestion in suggestions:
        print(f"  • {suggestion}")

# 显示下一章预告
next_chapter_preview()

2.7.3 练习题

def practice_exercises():
    """本章练习题"""
    print(f"\n📝 本章练习题")
    print("=" * 30)
    
    exercises = {
        "基础练习": [
            "创建不同类型和形状的张量",
            "实现张量的各种变换操作",
            "练习广播机制和索引操作",
            "计算简单函数的梯度"
        ],
        "进阶练习": [
            "实现自定义激活函数",
            "计算多变量函数的雅可比矩阵",
            "使用自动微分求解优化问题",
            "分析计算图的内存使用"
        ],
        "项目练习": [
            "实现数值优化算法",
            "构建自定义的自动微分系统",
            "优化大规模张量运算的性能",
            "实现混合精度训练框架"
        ]
    }
    
    for level, tasks in exercises.items():
        print(f"\n{level}:")
        for i, task in enumerate(tasks, 1):
            print(f"  {i}. {task}")
    
    print(f"\n🎯 完成建议:")
    print("  • 从基础练习开始,逐步提高难度")
    print("  • 每个练习都要动手编程实现")
    print("  • 理解底层原理,不只是使用API")
    print("  • 关注性能优化和内存管理")

# 显示练习题
practice_exercises()

2.7.4 代码示例总结

def code_examples_summary():
    """代码示例总结"""
    print(f"\n💻 本章代码示例总结")
    print("=" * 40)
    
    examples = {
        "张量操作": [
            "tensor_memory_model() - 内存模型演示",
            "tensor_data_types() - 数据类型转换",
            "tensor_creation_methods() - 创建方法大全",
            "tensor_shape_operations() - 形状变换",
            "tensor_indexing_slicing() - 索引切片",
            "broadcasting_mechanism() - 广播机制"
        ],
        "数学运算": [
            "basic_math_operations() - 基本运算",
            "linear_algebra_operations() - 线性代数",
            "statistical_operations() - 统计运算"
        ],
        "自动微分": [
            "autograd_basics() - 基础概念",
            "advanced_autograd() - 高级技术",
            "gradient_control() - 梯度控制",
            "computational_graph_analysis() - 计算图分析"
        ],
        "优化技术": [
            "memory_optimization() - 内存优化",
            "custom_activation_function() - 自定义函数",
            "numerical_optimization() - 数值优化"
        ]
    }
    
    for category, funcs in examples.items():
        print(f"\n{category}:")
        for func in funcs:
            print(f"  • {func}")
    
    print(f"\n📖 学习资源:")
    resources = [
        "PyTorch官方文档 - 张量操作",
        "PyTorch官方教程 - 自动微分",
        "《深度学习》- Goodfellow等著",
        "《PyTorch深度学习实战》"
    ]
    
    for resource in resources:
        print(f"  • {resource}")

# 显示代码示例总结
code_examples_summary()

🎉 恭喜完成第2章!

您已经成功完成了张量操作与自动微分的学习。现在您应该:

  • ✅ 深入理解PyTorch张量的内存模型和数据结构
  • ✅ 掌握各种张量操作、变换和索引技巧
  • ✅ 理解广播机制和高级数学运算
  • ✅ 掌握自动微分系统的工作原理
  • ✅ 学会梯度计算、反向传播和计算图优化
  • ✅ 了解内存管理和性能优化技术

下一步:继续学习第3章:神经网络基础,学习如何使用PyTorch构建和训练神经网络。

重要提示:

  • 张量操作是PyTorch的基础,务必熟练掌握
  • 自动微分是深度学习的核心,理解其原理很重要
  • 多做练习,特别是梯度计算和性能优化
  • 关注内存使用,这在实际项目中非常重要

继续加油!🚀