本章概述
张量(Tensor)是PyTorch的核心数据结构,自动微分(Automatic Differentiation)是深度学习的核心机制。本章将深入探讨PyTorch张量的各种操作,以及自动微分系统的工作原理。通过本章学习,您将掌握张量的高级操作技巧,理解梯度计算的底层机制。
学习目标
- 深入理解PyTorch张量的数据结构和内存模型
- 掌握张量的创建、变换、索引和广播机制
- 理解自动微分系统的工作原理
- 学会梯度计算和反向传播的实现
- 掌握计算图的构建和优化技巧
2.1 张量深入理解
2.1.1 张量的内存模型
import torch
import numpy as np
import matplotlib.pyplot as plt
def tensor_memory_model():
"""张量内存模型详解"""
print("=== 张量内存模型 ===")
# 1. 张量的存储结构
print("1. 张量存储结构:")
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(f"张量: \n{x}")
print(f"形状: {x.shape}")
print(f"步长: {x.stride()}")
print(f"存储偏移: {x.storage_offset()}")
print(f"是否连续: {x.is_contiguous()}")
# 2. 视图 vs 拷贝
print("\n2. 视图 vs 拷贝:")
# 视图操作(共享内存)
y = x.view(3, 2)
print(f"原张量: \n{x}")
print(f"视图张量: \n{y}")
print(f"共享存储: {x.storage().data_ptr() == y.storage().data_ptr()}")
# 修改原张量,视图也会改变
x[0, 0] = 100
print(f"修改后原张量: \n{x}")
print(f"修改后视图: \n{y}")
# 拷贝操作(独立内存)
z = x.clone()
x[0, 0] = 1
print(f"拷贝张量: \n{z}")
print(f"独立存储: {x.storage().data_ptr() != z.storage().data_ptr()}")
# 3. 内存布局
print("\n3. 内存布局:")
# 行优先 vs 列优先
row_major = torch.arange(12).reshape(3, 4)
col_major = row_major.t().contiguous().t()
print(f"行优先步长: {row_major.stride()}")
print(f"列优先步长: {col_major.stride()}")
print(f"行优先连续: {row_major.is_contiguous()}")
print(f"列优先连续: {col_major.is_contiguous()}")
# 运行内存模型演示
tensor_memory_model()
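为了进一步说明步长如何决定元素寻址,下面给出一个简短的演示草图:对索引 (i, j),元素在底层存储中的位置为 storage_offset + i*stride[0] + j*stride[1](示例中的函数名 manual_element_lookup 仅为演示而设)。
def manual_element_lookup():
    """用步长手工计算元素在存储中的偏移(演示用)"""
    x = torch.arange(12).reshape(3, 4)
    i, j = 1, 2
    # 偏移 = storage_offset + i*stride[0] + j*stride[1]
    offset = x.storage_offset() + i * x.stride(0) + j * x.stride(1)
    # 对连续张量,flatten() 的顺序与底层存储一致
    print(f"x[{i},{j}] = {x[i, j]}, 按步长计算 = {x.flatten()[offset]}")
# 运行步长寻址演示
manual_element_lookup()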
2.1.2 张量数据类型
def tensor_data_types():
"""张量数据类型详解"""
print("\n=== 张量数据类型 ===")
# 1. 基本数据类型
print("1. 基本数据类型:")
data_types = {
torch.float32: "32位浮点数 (默认)",
torch.float64: "64位浮点数",
torch.float16: "16位浮点数",
torch.int32: "32位整数",
torch.int64: "64位整数 (默认)",
torch.int8: "8位整数",
torch.uint8: "8位无符号整数",
torch.bool: "布尔类型"
}
for dtype, description in data_types.items():
x = torch.tensor([1.0], dtype=dtype)
print(f"{str(dtype):20} - {description:20} - 内存: {x.element_size()} bytes")
# 2. 类型转换
print("\n2. 类型转换:")
x = torch.randn(3, 3)
print(f"原始类型: {x.dtype}")
# 不同转换方法
x_int = x.int()
x_long = x.long()
x_float = x.float()
x_double = x.double()
x_half = x.half()
print(f"转换为int: {x_int.dtype}")
print(f"转换为long: {x_long.dtype}")
print(f"转换为float: {x_float.dtype}")
print(f"转换为double: {x_double.dtype}")
print(f"转换为half: {x_half.dtype}")
# 3. 类型提升
print("\n3. 自动类型提升:")
a = torch.tensor([1], dtype=torch.int32)
b = torch.tensor([2.0], dtype=torch.float32)
c = a + b
print(f"int32 + float32 = {c.dtype}")
# 4. 设备和类型
print("\n4. 设备和类型:")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x_cpu = torch.randn(3, 3, dtype=torch.float32)
x_gpu = x_cpu.to(device=device, dtype=torch.float16)
print(f"CPU张量: {x_cpu.device}, {x_cpu.dtype}")
print(f"GPU张量: {x_gpu.device}, {x_gpu.dtype}")
# 运行数据类型演示
tensor_data_types()
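如果想在不实际执行运算的情况下查看类型提升的结果,可以使用 torch.promote_types 和 torch.result_type。下面是一个简短的示例草图:
def type_promotion_examples():
    """查询类型提升规则(演示用)"""
    print(f"int32 与 float32 提升为: {torch.promote_types(torch.int32, torch.float32)}")
    print(f"float16 与 float64 提升为: {torch.promote_types(torch.float16, torch.float64)}")
    # result_type 还会考虑 Python 标量参与运算时的规则
    print(f"int64 张量 + Python float: {torch.result_type(torch.tensor([1]), 1.0)}")
# 运行类型提升演示
type_promotion_examples()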
2.1.3 张量创建方法
def tensor_creation_methods():
"""张量创建方法大全"""
print("\n=== 张量创建方法 ===")
# 1. 从数据创建
print("1. 从数据创建:")
# 从列表创建
list_tensor = torch.tensor([[1, 2], [3, 4]])
print(f"从列表: \n{list_tensor}")
# 从NumPy数组创建
np_array = np.array([[1, 2], [3, 4]])
np_tensor = torch.from_numpy(np_array)
print(f"从NumPy: \n{np_tensor}")
# 从另一个张量创建
like_tensor = torch.zeros_like(list_tensor)
print(f"zeros_like: \n{like_tensor}")
# 2. 特殊值张量
print("\n2. 特殊值张量:")
zeros = torch.zeros(2, 3)
ones = torch.ones(2, 3)
full = torch.full((2, 3), 7)
eye = torch.eye(3)
print(f"zeros: \n{zeros}")
print(f"ones: \n{ones}")
print(f"full: \n{full}")
print(f"eye: \n{eye}")
# 3. 随机张量
print("\n3. 随机张量:")
# 设置随机种子
torch.manual_seed(42)
rand = torch.rand(2, 3) # [0, 1) 均匀分布
randn = torch.randn(2, 3) # 标准正态分布
randint = torch.randint(0, 10, (2, 3)) # 随机整数
randperm = torch.randperm(10) # 随机排列
print(f"rand: \n{rand}")
print(f"randn: \n{randn}")
print(f"randint: \n{randint}")
print(f"randperm: {randperm}")
# 4. 序列张量
print("\n4. 序列张量:")
arange = torch.arange(0, 10, 2)
linspace = torch.linspace(0, 1, 5)
logspace = torch.logspace(0, 2, 5)
print(f"arange: {arange}")
print(f"linspace: {linspace}")
print(f"logspace: {logspace}")
# 5. 高级创建方法
print("\n5. 高级创建方法:")
# 网格张量
x = torch.linspace(-1, 1, 3)
y = torch.linspace(-1, 1, 3)
grid_x, grid_y = torch.meshgrid(x, y, indexing='ij')
print(f"meshgrid x: \n{grid_x}")
print(f"meshgrid y: \n{grid_y}")
# 对角张量
diag = torch.diag(torch.tensor([1, 2, 3]))
print(f"diag: \n{diag}")
# 运行张量创建演示
tensor_creation_methods()
2.2 张量操作详解
2.2.1 形状变换操作
def tensor_shape_operations():
"""张量形状变换操作"""
print("\n=== 张量形状变换 ===")
# 创建示例张量
x = torch.arange(24).reshape(2, 3, 4)
print(f"原始张量形状: {x.shape}")
print(f"原始张量: \n{x}")
# 1. 基本变换
print("\n1. 基本变换:")
# reshape vs view
reshaped = x.reshape(6, 4)
viewed = x.view(6, 4)
print(f"reshape: {reshaped.shape}")
print(f"view: {viewed.shape}")
print(f"连续性: reshape={reshaped.is_contiguous()}, view={viewed.is_contiguous()}")
# flatten
flattened = x.flatten()
print(f"flatten: {flattened.shape}")
# squeeze 和 unsqueeze
y = torch.randn(1, 3, 1, 4)
squeezed = y.squeeze() # 移除大小为1的维度
unsqueezed = squeezed.unsqueeze(0) # 添加维度
print(f"原始: {y.shape}")
print(f"squeeze: {squeezed.shape}")
print(f"unsqueeze: {unsqueezed.shape}")
# 2. 维度操作
print("\n2. 维度操作:")
# transpose
transposed = x.transpose(0, 2)
print(f"transpose(0,2): {transposed.shape}")
# permute
permuted = x.permute(2, 0, 1)
print(f"permute(2,0,1): {permuted.shape}")
# 矩阵转置
matrix = torch.randn(3, 4)
print(f"矩阵转置: {matrix.shape} -> {matrix.t().shape}")
# 3. 高级变换
print("\n3. 高级变换:")
# chunk - 分块
chunks = x.chunk(2, dim=0)
print(f"chunk成{len(chunks)}块,每块形状: {[c.shape for c in chunks]}")
# split - 分割
splits = x.split([1, 1], dim=0)
print(f"split成{len(splits)}部分,每部分形状: {[s.shape for s in splits]}")
# repeat - 重复
repeated = torch.tensor([1, 2]).repeat(2, 3)
print(f"repeat: {repeated}")
# expand - 扩展
small = torch.tensor([[1], [2]])
expanded = small.expand(2, 3)
print(f"expand: \n{expanded}")
print(f"expand共享内存: {small.storage().data_ptr() == expanded.storage().data_ptr()}")
# 运行形状变换演示
tensor_shape_operations()
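view 只能作用于满足连续性条件的张量,而 reshape 在必要时会自动复制数据。下面的简短草图演示了转置后的非连续张量无法直接 view、但可以 reshape 的情况:
def view_vs_reshape_noncontiguous():
    """非连续张量上的 view 与 reshape 对比(演示用)"""
    x = torch.arange(6).reshape(2, 3)
    xt = x.t()  # 转置得到非连续张量
    print(f"转置后是否连续: {xt.is_contiguous()}")
    try:
        xt.view(6)  # 非连续张量上通常会失败
    except RuntimeError as e:
        print(f"view 失败: {e}")
    print(f"reshape 成功: {xt.reshape(6)}")  # reshape 会在需要时复制数据
# 运行 view/reshape 对比演示
view_vs_reshape_noncontiguous()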
2.2.2 索引和切片
def tensor_indexing_slicing():
"""张量索引和切片操作"""
print("\n=== 张量索引和切片 ===")
# 创建示例张量
x = torch.arange(24).reshape(4, 6)
print(f"原始张量: \n{x}")
# 1. 基本索引
print("\n1. 基本索引:")
print(f"x[0]: {x[0]}") # 第一行
print(f"x[:, 0]: {x[:, 0]}") # 第一列
print(f"x[1, 3]: {x[1, 3]}") # 单个元素
print(f"x[1:3, 2:5]: \n{x[1:3, 2:5]}") # 切片
# 2. 高级索引
print("\n2. 高级索引:")
# 布尔索引
mask = x > 10
print(f"mask (x > 10): \n{mask}")
print(f"x[mask]: {x[mask]}")
# 整数数组索引
rows = torch.tensor([0, 2, 3])
cols = torch.tensor([1, 3, 5])
print(f"x[rows, cols]: {x[rows, cols]}")
# 3. 花式索引
print("\n3. 花式索引:")
# gather - 按索引收集
indices = torch.tensor([[0, 1, 2], [3, 4, 5]])
gathered = x.gather(1, indices)
print(f"gather结果: \n{gathered}")
# scatter - 按索引分散
y = torch.zeros(4, 6)
y.scatter_(1, indices, 100)
print(f"scatter结果: \n{y}")
# index_select - 按索引选择
selected = x.index_select(0, torch.tensor([0, 2]))
print(f"index_select结果: \n{selected}")
# 4. 条件索引
print("\n4. 条件索引:")
# where - 条件选择
condition = x > 15
result = torch.where(condition, x, torch.zeros_like(x))
print(f"where结果: \n{result}")
# masked_select - 掩码选择
masked = x.masked_select(x > 15)
print(f"masked_select结果: {masked}")
# nonzero - 非零元素索引
nonzero_indices = (x > 15).nonzero()
print(f"nonzero索引: \n{nonzero_indices}")
# 运行索引切片演示
tensor_indexing_slicing()
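gather 的语义可以用公式描述:当 dim=1 时,out[i][j] = x[i][index[i][j]]。下面用一个简短的草图按该公式手工验证前面的 gather 结果:
def gather_semantics_check():
    """按公式手工验证 gather 的结果(演示用)"""
    x = torch.arange(24).reshape(4, 6)
    indices = torch.tensor([[0, 1, 2], [3, 4, 5]])
    gathered = x.gather(1, indices)
    manual = torch.zeros_like(gathered)
    for i in range(indices.size(0)):
        for j in range(indices.size(1)):
            manual[i, j] = x[i, indices[i, j]]  # out[i][j] = x[i][index[i][j]]
    print(f"手工结果与 gather 一致: {torch.equal(manual, gathered)}")
# 运行 gather 语义验证
gather_semantics_check()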
2.2.3 广播机制
def broadcasting_mechanism():
"""广播机制详解"""
print("\n=== 广播机制 ===")
# 1. 广播规则
print("1. 广播规则:")
print("- 从右向左比较维度")
print("- 维度大小相等或其中一个为1")
print("- 缺失维度视为1")
# 2. 广播示例
print("\n2. 广播示例:")
# 标量与张量
a = torch.tensor([[1, 2, 3], [4, 5, 6]])
b = 10
result1 = a + b
print(f"张量 + 标量:")
print(f"a: {a.shape} \n{a}")
print(f"b: 标量 {b}")
print(f"结果: \n{result1}")
# 不同形状张量
c = torch.tensor([[1], [2]]) # (2, 1)
d = torch.tensor([10, 20, 30]) # (3,)
result2 = c + d
print(f"\n不同形状张量:")
print(f"c: {c.shape} \n{c}")
print(f"d: {d.shape} {d}")
print(f"结果: {result2.shape} \n{result2}")
# 3. 广播可视化
print("\n3. 广播过程可视化:")
def visualize_broadcast(x, y):
print(f"原始形状: x{x.shape}, y{y.shape}")
# 检查是否可以广播
try:
result = x + y
print(f"广播后形状: {result.shape}")
print(f"广播成功 ✓")
return result
except RuntimeError as e:
print(f"广播失败 ✗: {e}")
return None
# 可广播的例子
x1 = torch.randn(3, 1, 4)
y1 = torch.randn(1, 2, 1)
visualize_broadcast(x1, y1)
# 不可广播的例子
x2 = torch.randn(3, 4)
y2 = torch.randn(2, 3)
visualize_broadcast(x2, y2)
# 4. 手动广播
print("\n4. 手动广播:")
a = torch.tensor([[1, 2]]) # (1, 2)
b = torch.tensor([[3], [4]]) # (2, 1)
# 使用expand手动广播
a_expanded = a.expand(2, 2)
b_expanded = b.expand(2, 2)
print(f"a原始: {a.shape} \n{a}")
print(f"a扩展: {a_expanded.shape} \n{a_expanded}")
print(f"b原始: {b.shape} \n{b}")
print(f"b扩展: {b_expanded.shape} \n{b_expanded}")
print(f"手动广播结果: \n{a_expanded + b_expanded}")
# 5. 广播性能考虑
print("\n5. 广播性能:")
import time
# 大张量广播测试
large_a = torch.randn(1000, 1)
large_b = torch.randn(1, 1000)
start = time.time()
for _ in range(100):
result = large_a + large_b
broadcast_time = time.time() - start
    # 预先用 expand 扩展(expand 只创建视图、不复制数据,因此底层计算与直接广播基本相同)
expanded_a = large_a.expand(1000, 1000)
expanded_b = large_b.expand(1000, 1000)
start = time.time()
for _ in range(100):
result = expanded_a + expanded_b
expanded_time = time.time() - start
print(f"广播时间: {broadcast_time:.4f}s")
print(f"预扩展时间: {expanded_time:.4f}s")
# 运行广播机制演示
broadcasting_mechanism()
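广播规则本身也可以用几行代码表达:左侧补 1 对齐维度后逐维比较,大小相等或其中一个为 1 时取较大值。下面是按此规则手工推导广播形状的一个简短草图(函数 manual_broadcast_shape 为演示假设),可与 torch.broadcast_shapes 的结果对照:
def manual_broadcast_shape(shape_a, shape_b):
    """按广播规则手工推导结果形状(演示用)"""
    ndim = max(len(shape_a), len(shape_b))
    a = (1,) * (ndim - len(shape_a)) + tuple(shape_a)  # 左侧补 1
    b = (1,) * (ndim - len(shape_b)) + tuple(shape_b)
    result = []
    for da, db in zip(a, b):
        if da != db and da != 1 and db != 1:
            raise ValueError(f"无法广播: {shape_a} 与 {shape_b}")
        result.append(max(da, db))
    return tuple(result)
print(f"手工推导: {manual_broadcast_shape((3, 1, 4), (1, 2, 1))}")
print(f"torch.broadcast_shapes: {torch.broadcast_shapes((3, 1, 4), (1, 2, 1))}")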
2.3 数学运算
2.3.1 基本数学运算
def basic_math_operations():
"""基本数学运算"""
print("\n=== 基本数学运算 ===")
# 创建示例张量
a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
b = torch.tensor([[5.0, 6.0], [7.0, 8.0]])
print(f"a: \n{a}")
print(f"b: \n{b}")
# 1. 算术运算
print("\n1. 算术运算:")
print(f"加法 a + b: \n{a + b}")
print(f"减法 a - b: \n{a - b}")
print(f"乘法 a * b: \n{a * b}")
print(f"除法 a / b: \n{a / b}")
print(f"幂运算 a ** 2: \n{a ** 2}")
print(f"取模 a % 3: \n{a % 3}")
# 2. 就地运算
print("\n2. 就地运算:")
c = a.clone()
print(f"原始 c: \n{c}")
c.add_(1) # 就地加法
print(f"c.add_(1): \n{c}")
c.mul_(2) # 就地乘法
print(f"c.mul_(2): \n{c}")
# 3. 比较运算
print("\n3. 比较运算:")
print(f"a > 2: \n{a > 2}")
print(f"a == b: \n{a == b}")
print(f"torch.eq(a, 2): \n{torch.eq(a, 2)}")
print(f"torch.gt(a, 2): \n{torch.gt(a, 2)}")
# 4. 逻辑运算
print("\n4. 逻辑运算:")
mask1 = a > 2
mask2 = b < 7
print(f"mask1 (a > 2): \n{mask1}")
print(f"mask2 (b < 7): \n{mask2}")
print(f"mask1 & mask2: \n{mask1 & mask2}")
print(f"mask1 | mask2: \n{mask1 | mask2}")
print(f"~mask1: \n{~mask1}")
# 运行基本数学运算演示
basic_math_operations()
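需要注意,就地运算与自动微分之间存在限制:对 requires_grad=True 的叶子张量直接就地修改会报错;参数更新通常要放在 torch.no_grad() 中进行。下面是一个简短的演示草图:
def inplace_autograd_caution():
    """就地运算与 requires_grad 的冲突(演示用)"""
    x = torch.tensor([1.0, 2.0], requires_grad=True)
    try:
        x.add_(1)  # 对需要梯度的叶子张量就地修改会报错
    except RuntimeError as e:
        print(f"就地修改失败: {e}")
    with torch.no_grad():
        x.add_(1)  # 在 no_grad 下可以安全地就地更新(如优化器更新参数)
    print(f"no_grad 中更新后的 x: {x}")
# 运行就地运算注意事项演示
inplace_autograd_caution()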
2.3.2 线性代数运算
def linear_algebra_operations():
"""线性代数运算"""
print("\n=== 线性代数运算 ===")
# 1. 矩阵乘法
print("1. 矩阵乘法:")
A = torch.randn(3, 4)
B = torch.randn(4, 5)
# 不同的矩阵乘法方式
C1 = torch.mm(A, B) # 2D矩阵乘法
C2 = A @ B # 运算符重载
C3 = torch.matmul(A, B) # 通用矩阵乘法
print(f"A: {A.shape}")
print(f"B: {B.shape}")
print(f"C: {C1.shape}")
print(f"结果一致: {torch.allclose(C1, C2) and torch.allclose(C2, C3)}")
# 批量矩阵乘法
batch_A = torch.randn(10, 3, 4)
batch_B = torch.randn(10, 4, 5)
batch_C = torch.bmm(batch_A, batch_B)
print(f"批量矩阵乘法: {batch_A.shape} @ {batch_B.shape} = {batch_C.shape}")
# 2. 向量运算
print("\n2. 向量运算:")
u = torch.tensor([1.0, 2.0, 3.0])
v = torch.tensor([4.0, 5.0, 6.0])
# 点积
dot_product = torch.dot(u, v)
print(f"点积 u·v: {dot_product}")
# 外积
outer_product = torch.outer(u, v)
print(f"外积 u⊗v: \n{outer_product}")
# 叉积 (3D向量)
    cross_product = torch.linalg.cross(u, v)  # 使用 linalg.cross,避免 torch.cross 不指定 dim 的弃用警告
print(f"叉积 u×v: {cross_product}")
# 3. 矩阵分解
print("\n3. 矩阵分解:")
# 创建对称正定矩阵
X = torch.randn(4, 4)
A = X @ X.t() + torch.eye(4) # 确保正定
# 特征值分解
eigenvals, eigenvecs = torch.linalg.eig(A)
print(f"特征值: {eigenvals.real}")
# SVD分解
U, S, Vh = torch.linalg.svd(A)
print(f"SVD: U{U.shape}, S{S.shape}, Vh{Vh.shape}")
# Cholesky分解
L = torch.linalg.cholesky(A)
print(f"Cholesky分解验证: {torch.allclose(L @ L.t(), A)}")
# QR分解
Q, R = torch.linalg.qr(A)
print(f"QR分解验证: {torch.allclose(Q @ R, A)}")
# 4. 矩阵性质
print("\n4. 矩阵性质:")
# 行列式
det = torch.linalg.det(A)
print(f"行列式: {det}")
# 矩阵的秩
rank = torch.linalg.matrix_rank(A)
print(f"矩阵的秩: {rank}")
# 条件数
cond = torch.linalg.cond(A)
print(f"条件数: {cond}")
# 矩阵范数
frobenius_norm = torch.linalg.matrix_norm(A, 'fro')
spectral_norm = torch.linalg.matrix_norm(A, 2)
print(f"Frobenius范数: {frobenius_norm}")
print(f"谱范数: {spectral_norm}")
# 5. 求解线性方程组
print("\n5. 线性方程组求解:")
# Ax = b
b = torch.randn(4)
x = torch.linalg.solve(A, b)
print(f"方程组解验证: {torch.allclose(A @ x, b)}")
# 最小二乘解
A_rect = torch.randn(6, 4) # 超定系统
b_rect = torch.randn(6)
x_lstsq = torch.linalg.lstsq(A_rect, b_rect).solution
print(f"最小二乘解形状: {x_lstsq.shape}")
# 运行线性代数运算演示
linear_algebra_operations()
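补充一点:上面的 A 是对称正定矩阵,这类矩阵更适合用 torch.linalg.eigh,它直接返回实数特征值且数值上更稳定;还可以用 A = V·diag(λ)·Vᵀ 验证分解。下面是一个简短的示例草图:
def symmetric_eigendecomposition_demo():
    """对称矩阵的特征分解与重构验证(演示用)"""
    X = torch.randn(4, 4)
    A = X @ X.t() + torch.eye(4)  # 对称正定矩阵
    eigenvals, eigenvecs = torch.linalg.eigh(A)  # 实特征值,按升序排列
    A_rec = eigenvecs @ torch.diag(eigenvals) @ eigenvecs.t()
    print(f"特征值均为正: {(eigenvals > 0).all()}")
    print(f"重构是否接近原矩阵: {torch.allclose(A_rec, A, atol=1e-5)}")
# 运行对称矩阵特征分解演示
symmetric_eigendecomposition_demo()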
2.3.3 统计和聚合运算
def statistical_operations():
"""统计和聚合运算"""
print("\n=== 统计和聚合运算 ===")
# 创建示例数据
torch.manual_seed(42)
data = torch.randn(4, 5)
print(f"数据: \n{data}")
# 1. 基本统计量
print("\n1. 基本统计量:")
print(f"均值: {data.mean()}")
print(f"标准差: {data.std()}")
print(f"方差: {data.var()}")
print(f"最小值: {data.min()}")
print(f"最大值: {data.max()}")
print(f"中位数: {data.median()}")
# 按维度统计
print(f"\n按行统计 (dim=1):")
print(f"行均值: {data.mean(dim=1)}")
print(f"行标准差: {data.std(dim=1)}")
print(f"\n按列统计 (dim=0):")
print(f"列均值: {data.mean(dim=0)}")
print(f"列标准差: {data.std(dim=0)}")
# 2. 聚合运算
print("\n2. 聚合运算:")
print(f"求和: {data.sum()}")
print(f"乘积: {data.prod()}")
print(f"累积和: {data.cumsum(dim=0)}")
print(f"累积乘积: {data.cumprod(dim=1)}")
# 3. 排序和排名
print("\n3. 排序和排名:")
# 排序
sorted_data, indices = data.sort(dim=1)
print(f"按行排序: \n{sorted_data}")
print(f"排序索引: \n{indices}")
# topk
values, indices = data.topk(3, dim=1)
print(f"每行top3值: \n{values}")
print(f"每行top3索引: \n{indices}")
# 4. 分位数
print("\n4. 分位数:")
print(f"25%分位数: {data.quantile(0.25)}")
print(f"50%分位数: {data.quantile(0.5)}")
print(f"75%分位数: {data.quantile(0.75)}")
# 5. 直方图
print("\n5. 直方图:")
hist = torch.histc(data.flatten(), bins=10, min=-3, max=3)
print(f"直方图: {hist}")
# 6. 相关性分析
print("\n6. 相关性分析:")
# 协方差矩阵
cov_matrix = torch.cov(data.t()) # 转置后计算列之间的协方差
print(f"协方差矩阵: \n{cov_matrix}")
# 相关系数矩阵
corr_matrix = torch.corrcoef(data.t())
print(f"相关系数矩阵: \n{corr_matrix}")
# 运行统计运算演示
statistical_operations()
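统计量经常与广播配合使用,例如按列标准化数据;归约时加上 keepdim=True 可以保留被归约的维度,便于直接广播。下面是一个简短示例:
def standardize_columns_demo():
    """利用统计量与广播按列标准化数据(演示用)"""
    data = torch.randn(4, 5)
    mean = data.mean(dim=0, keepdim=True)  # 形状 (1, 5)
    std = data.std(dim=0, keepdim=True)    # 形状 (1, 5)
    standardized = (data - mean) / std     # 广播回 (4, 5)
    print(f"标准化后列均值(接近0): {standardized.mean(dim=0)}")
    print(f"标准化后列标准差(接近1): {standardized.std(dim=0)}")
# 运行按列标准化演示
standardize_columns_demo()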
2.4 自动微分系统
2.4.1 自动微分基础
def autograd_basics():
"""自动微分基础"""
print("\n=== 自动微分基础 ===")
# 1. 启用梯度追踪
print("1. 梯度追踪:")
# 创建需要梯度的张量
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)
print(f"x: {x}, requires_grad: {x.requires_grad}")
print(f"y: {y}, requires_grad: {y.requires_grad}")
# 计算函数值
z = x**2 + 2*x*y + y**2
print(f"z = x² + 2xy + y² = {z}")
# 反向传播
z.backward()
print(f"∂z/∂x = {x.grad}") # 应该是 2x + 2y = 2*2 + 2*3 = 10
print(f"∂z/∂y = {y.grad}") # 应该是 2x + 2y = 2*2 + 2*3 = 10
# 2. 梯度累积
print("\n2. 梯度累积:")
x.grad.zero_() # 清零梯度
y.grad.zero_()
# 第一次计算
z1 = x**2
z1.backward()
print(f"第一次: ∂(x²)/∂x = {x.grad}")
# 第二次计算(不清零梯度)
z2 = 3*x
z2.backward()
print(f"累积后: ∂(x² + 3x)/∂x = {x.grad}")
# 3. 计算图可视化
print("\n3. 计算图:")
x = torch.tensor([1.0], requires_grad=True)
a = x + 2
b = a * 3
c = b**2
print(f"x -> a=x+2 -> b=a*3 -> c=b²")
print(f"x={x.item()}, a={a.item()}, b={b.item()}, c={c.item()}")
c.backward()
print(f"∂c/∂x = {x.grad}") # 链式法则: 2b * 3 * 1 = 2*9*3 = 54
# 运行自动微分基础演示
autograd_basics()
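验证自动微分结果的一个常用办法是与数值梯度(中心差分)对比:f'(x) ≈ (f(x+h) - f(x-h)) / (2h)。下面是一个简短的对比草图(使用双精度以减小数值误差):
def numerical_gradient_check():
    """自动微分梯度与中心差分数值梯度对比(演示用)"""
    def f(x):
        return x**3 + torch.sin(x)
    x = torch.tensor([1.5], dtype=torch.float64, requires_grad=True)
    f(x).backward()
    h = 1e-6
    with torch.no_grad():
        numeric = (f(x + h) - f(x - h)) / (2 * h)  # 中心差分
    print(f"自动微分: {x.grad.item():.8f}")
    print(f"数值梯度: {numeric.item():.8f}")
# 运行数值梯度校验演示
numerical_gradient_check()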
2.4.2 高级自动微分
def advanced_autograd():
"""高级自动微分技术"""
print("\n=== 高级自动微分 ===")
# 1. 高阶导数
print("1. 高阶导数:")
x = torch.tensor([2.0], requires_grad=True)
y = x**4 + 2*x**3 + x**2
# 一阶导数
grad1 = torch.autograd.grad(y, x, create_graph=True)[0]
print(f"f(x) = x⁴ + 2x³ + x²")
print(f"f'(x) = {grad1}")
# 二阶导数
grad2 = torch.autograd.grad(grad1, x, create_graph=True)[0]
print(f"f''(x) = {grad2}")
# 三阶导数
grad3 = torch.autograd.grad(grad2, x)[0]
print(f"f'''(x) = {grad3}")
# 2. 雅可比矩阵
print("\n2. 雅可比矩阵:")
def vector_function(x):
"""向量函数 f: R² -> R²"""
return torch.stack([
x[0]**2 + x[1],
x[0] * x[1]**2
])
x = torch.tensor([2.0, 3.0], requires_grad=True)
y = vector_function(x)
# 计算雅可比矩阵
jacobian = torch.zeros(2, 2)
for i in range(2):
grad = torch.autograd.grad(y[i], x, retain_graph=True)[0]
jacobian[i] = grad
print(f"输入: {x}")
print(f"输出: {y}")
print(f"雅可比矩阵: \n{jacobian}")
# 3. 海塞矩阵
print("\n3. 海塞矩阵:")
def scalar_function(x):
"""标量函数 f: R² -> R"""
return x[0]**2 * x[1] + x[1]**3
x = torch.tensor([1.0, 2.0], requires_grad=True)
y = scalar_function(x)
# 计算海塞矩阵
grad = torch.autograd.grad(y, x, create_graph=True)[0]
hessian = torch.zeros(2, 2)
for i in range(2):
grad2 = torch.autograd.grad(grad[i], x, retain_graph=True)[0]
hessian[i] = grad2
print(f"函数值: {y}")
print(f"梯度: {grad}")
print(f"海塞矩阵: \n{hessian}")
# 4. 函数式API
print("\n4. 函数式API:")
# torch.func.jacrev - 雅可比矩阵(反向模式)
try:
import torch.func as func
def f(x):
return x**2 + torch.sin(x)
x = torch.tensor([1.0, 2.0])
jac = func.jacrev(f)(x)
print(f"函数式雅可比: {jac}")
except ImportError:
print("torch.func 不可用,跳过函数式API演示")
# 运行高级自动微分演示
advanced_autograd()
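除了手工循环,PyTorch 还提供 torch.autograd.functional.jacobian 和 hessian,可以一次性得到整个雅可比/海塞矩阵。下面的简短示例与前面手工计算使用相同的函数,便于对照:
def functional_jacobian_hessian_demo():
    """使用 torch.autograd.functional 计算雅可比与海塞矩阵(演示用)"""
    from torch.autograd.functional import jacobian, hessian
    def vec_f(x):
        return torch.stack([x[0]**2 + x[1], x[0] * x[1]**2])
    def scalar_f(x):
        return x[0]**2 * x[1] + x[1]**3
    print(f"雅可比矩阵: \n{jacobian(vec_f, torch.tensor([2.0, 3.0]))}")
    print(f"海塞矩阵: \n{hessian(scalar_f, torch.tensor([1.0, 2.0]))}")
# 运行函数式雅可比/海塞演示
functional_jacobian_hessian_demo()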
2.4.3 梯度控制技术
def gradient_control():
"""梯度控制技术"""
print("\n=== 梯度控制技术 ===")
# 1. 梯度上下文管理
print("1. 梯度上下文管理:")
x = torch.tensor([1.0], requires_grad=True)
# 正常计算(有梯度)
y1 = x**2
print(f"正常计算: y1.requires_grad = {y1.requires_grad}")
# 无梯度计算
with torch.no_grad():
y2 = x**2
print(f"no_grad: y2.requires_grad = {y2.requires_grad}")
# 推理模式
with torch.inference_mode():
y3 = x**2
print(f"inference_mode: y3.requires_grad = {y3.requires_grad}")
# 2. 梯度检查点
print("\n2. 梯度检查点:")
def expensive_function(x):
"""模拟计算昂贵的函数"""
for _ in range(1000):
x = torch.sin(x)
return x
x = torch.tensor([1.0], requires_grad=True)
    # 使用检查点节省内存(显式导入 checkpoint 并指定 use_reentrant,避免新版 PyTorch 的警告)
    from torch.utils.checkpoint import checkpoint
    y = checkpoint(expensive_function, x, use_reentrant=False)
y.backward()
print(f"检查点梯度: {x.grad}")
# 3. 梯度裁剪
print("\n3. 梯度裁剪:")
# 创建一个简单模型
import torch.nn as nn
model = nn.Linear(10, 1)
x = torch.randn(5, 10)
y = torch.randn(5, 1)
# 计算损失和梯度
loss = nn.MSELoss()(model(x), y)
loss.backward()
# 梯度裁剪前
grad_norm_before = torch.nn.utils.clip_grad_norm_(model.parameters(), float('inf'))
print(f"裁剪前梯度范数: {grad_norm_before}")
# 重新计算梯度
model.zero_grad()
loss.backward()
# 梯度裁剪
max_norm = 1.0
grad_norm_after = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
print(f"裁剪后梯度范数: {grad_norm_after}")
# 4. 自定义梯度函数
print("\n4. 自定义梯度函数:")
class CustomFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, x):
# 保存用于反向传播的张量
ctx.save_for_backward(x)
return x**2
@staticmethod
def backward(ctx, grad_output):
# 获取保存的张量
x, = ctx.saved_tensors
# 返回输入的梯度
return grad_output * 2 * x
# 使用自定义函数
custom_func = CustomFunction.apply
x = torch.tensor([3.0], requires_grad=True)
y = custom_func(x)
y.backward()
print(f"自定义函数梯度: {x.grad}")
# 5. 梯度钩子
print("\n5. 梯度钩子:")
def gradient_hook(grad):
print(f"梯度钩子捕获: {grad}")
return grad * 2 # 修改梯度
x = torch.tensor([2.0], requires_grad=True)
x.register_hook(gradient_hook)
y = x**3
y.backward()
print(f"钩子修改后梯度: {x.grad}")
# 运行梯度控制演示
gradient_control()
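另一种常用的梯度控制手段是 detach():它返回一个与原张量共享数据、但脱离计算图的新张量,常用来截断某条路径上的梯度。下面是一个简短示例:
def detach_demo():
    """使用 detach() 截断梯度(演示用)"""
    x = torch.tensor([2.0], requires_grad=True)
    a = x**2
    b = a.detach()   # b 与 a 数值相同,但不再回传梯度
    y = a + 3 * b    # 3*b 这条路径的梯度被截断
    y.backward()
    print(f"x.grad = {x.grad}")  # 只有 a = x² 这条路径贡献梯度: 2x = 4
# 运行 detach 演示
detach_demo()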
2.5 计算图优化
2.5.1 计算图分析
def computational_graph_analysis():
"""计算图分析"""
print("\n=== 计算图分析 ===")
# 1. 计算图构建
print("1. 计算图构建:")
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)
# 构建复杂计算图
a = x + y
b = x * y
c = a / b
d = torch.sin(c)
e = d**2
print(f"计算图: x,y -> a=x+y, b=x*y -> c=a/b -> d=sin(c) -> e=d²")
print(f"最终结果: {e}")
# 2. 计算图可视化信息
print("\n2. 计算图信息:")
def print_graph_info(tensor, name):
if tensor.grad_fn is not None:
print(f"{name}: {tensor.grad_fn}")
print(f" 输入: {[inp[0].grad_fn if inp[0].grad_fn else 'leaf' for inp in tensor.grad_fn.next_functions]}")
print_graph_info(a, "a")
print_graph_info(b, "b")
print_graph_info(c, "c")
print_graph_info(d, "d")
print_graph_info(e, "e")
# 3. 叶子节点检查
print("\n3. 叶子节点:")
for name, tensor in [("x", x), ("y", y), ("a", a), ("e", e)]:
print(f"{name}: is_leaf={tensor.is_leaf}, requires_grad={tensor.requires_grad}")
# 4. 计算图释放
print("\n4. 计算图管理:")
# 保留计算图
e.backward(retain_graph=True)
print(f"第一次反向传播: x.grad={x.grad}, y.grad={y.grad}")
# 再次反向传播(需要retain_graph=True)
x.grad.zero_()
y.grad.zero_()
e.backward()
print(f"第二次反向传播: x.grad={x.grad}, y.grad={y.grad}")
# 运行计算图分析演示
computational_graph_analysis()
2.5.2 内存优化技术
def memory_optimization():
"""内存优化技术"""
print("\n=== 内存优化技术 ===")
# 1. 就地操作
print("1. 就地操作:")
x = torch.randn(1000, 1000)
print(f"原始内存地址: {x.data_ptr()}")
# 非就地操作(创建新张量)
y = x + 1
print(f"非就地操作地址: {y.data_ptr()}")
print(f"地址相同: {x.data_ptr() == y.data_ptr()}")
# 就地操作(修改原张量)
x.add_(1)
print(f"就地操作后地址: {x.data_ptr()}")
# 2. 内存池管理
print("\n2. GPU内存管理:")
if torch.cuda.is_available():
device = torch.device("cuda")
# 清空缓存
torch.cuda.empty_cache()
print(f"清空后GPU内存: {torch.cuda.memory_allocated() / 1e6:.1f} MB")
# 分配大张量
large_tensor = torch.randn(1000, 1000, device=device)
print(f"分配后GPU内存: {torch.cuda.memory_allocated() / 1e6:.1f} MB")
# 删除张量
del large_tensor
print(f"删除后GPU内存: {torch.cuda.memory_allocated() / 1e6:.1f} MB")
# 清空缓存
torch.cuda.empty_cache()
print(f"清空缓存后GPU内存: {torch.cuda.memory_allocated() / 1e6:.1f} MB")
# 3. 梯度累积
print("\n3. 梯度累积:")
# 模拟大批量训练
model = torch.nn.Linear(100, 10)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# 传统方法(大批量)
large_batch = torch.randn(1000, 100)
large_target = torch.randn(1000, 10)
# 梯度累积方法(小批量)
accumulation_steps = 10
small_batch_size = 100
optimizer.zero_grad()
for i in range(accumulation_steps):
start_idx = i * small_batch_size
end_idx = start_idx + small_batch_size
small_batch = large_batch[start_idx:end_idx]
small_target = large_target[start_idx:end_idx]
output = model(small_batch)
loss = criterion(output, small_target)
loss = loss / accumulation_steps # 平均损失
loss.backward()
optimizer.step()
print("梯度累积完成")
# 4. 混合精度训练
print("\n4. 混合精度训练:")
if torch.cuda.is_available():
# 自动混合精度
scaler = torch.cuda.amp.GradScaler()
model = model.cuda()
x = torch.randn(32, 100, device='cuda')
target = torch.randn(32, 10, device='cuda')
optimizer.zero_grad()
with torch.cuda.amp.autocast():
output = model(x)
loss = criterion(output, target)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
print("混合精度训练完成")
# 运行内存优化演示
memory_optimization()
2.6 实践案例
2.6.1 自定义激活函数
def custom_activation_function():
"""自定义激活函数案例"""
print("\n=== 自定义激活函数 ===")
# 1. 使用Function类定义
class Swish(torch.autograd.Function):
@staticmethod
def forward(ctx, x):
# Swish: x * sigmoid(x)
sigmoid_x = torch.sigmoid(x)
output = x * sigmoid_x
# 保存用于反向传播的值
ctx.save_for_backward(x, sigmoid_x, output)
return output
@staticmethod
def backward(ctx, grad_output):
# 获取保存的值
x, sigmoid_x, output = ctx.saved_tensors
# 计算导数: d/dx[x * sigmoid(x)] = sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))
grad_input = sigmoid_x + output * (1 - sigmoid_x)
return grad_output * grad_input
# 2. 测试自定义激活函数
swish = Swish.apply
x = torch.linspace(-3, 3, 100, requires_grad=True)
y = swish(x)
# 计算梯度
grad_outputs = torch.ones_like(y)
x_grad = torch.autograd.grad(y, x, grad_outputs)[0]
# 3. 与PyTorch内置版本比较
y_builtin = x * torch.sigmoid(x)
x_grad_builtin = torch.autograd.grad(y_builtin, x, grad_outputs)[0]
print(f"自定义版本与内置版本一致: {torch.allclose(y, y_builtin)}")
print(f"梯度一致: {torch.allclose(x_grad, x_grad_builtin)}")
# 4. 可视化
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(x.detach().numpy(), y.detach().numpy(), label='Swish')
plt.plot(x.detach().numpy(), torch.relu(x).detach().numpy(), label='ReLU')
plt.plot(x.detach().numpy(), torch.tanh(x).detach().numpy(), label='Tanh')
plt.title('Activation Functions')
plt.xlabel('x')
plt.ylabel('f(x)')
plt.legend()
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(x.detach().numpy(), x_grad.detach().numpy(), label="Swish'")
plt.title('Derivatives')
plt.xlabel('x')
plt.ylabel("f'(x)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# 运行自定义激活函数演示
custom_activation_function()
2.6.2 数值优化问题
def numerical_optimization():
"""数值优化问题案例"""
print("\n=== 数值优化问题 ===")
# 1. 函数最小化
print("1. 函数最小化:")
# 目标函数: f(x, y) = (x-1)² + (y-2)² + sin(x*y)
def objective_function(params):
x, y = params
return (x - 1)**2 + (y - 2)**2 + torch.sin(x * y)
# 初始化参数
params = torch.tensor([0.0, 0.0], requires_grad=True)
optimizer = torch.optim.Adam([params], lr=0.1)
# 优化过程
history = []
for i in range(100):
optimizer.zero_grad()
loss = objective_function(params)
loss.backward()
optimizer.step()
history.append(loss.item())
if i % 20 == 0:
print(f"Step {i}: loss={loss.item():.6f}, params={params.detach().numpy()}")
print(f"最终结果: x={params[0].item():.4f}, y={params[1].item():.4f}")
# 2. 约束优化
print("\n2. 约束优化:")
# 目标: 最小化 x² + y² 约束于 x + y = 1
def constrained_objective(params):
x, y = params
objective = x**2 + y**2
constraint = (x + y - 1)**2 # 惩罚项
        return objective + 10 * constraint  # 罚函数法(penalty method):用惩罚项近似处理等式约束
params = torch.tensor([0.0, 0.0], requires_grad=True)
optimizer = torch.optim.LBFGS([params])
def closure():
optimizer.zero_grad()
loss = constrained_objective(params)
loss.backward()
return loss
for i in range(10):
loss = optimizer.step(closure)
if i % 2 == 0:
x, y = params.detach().numpy()
print(f"Step {i}: x={x:.4f}, y={y:.4f}, constraint={x+y:.4f}")
# 3. 多变量函数拟合
print("\n3. 多变量函数拟合:")
# 生成数据: z = sin(x) * cos(y) + noise
torch.manual_seed(42)
n_points = 100
x_data = torch.rand(n_points) * 4 - 2 # [-2, 2]
y_data = torch.rand(n_points) * 4 - 2 # [-2, 2]
z_true = torch.sin(x_data) * torch.cos(y_data)
z_data = z_true + 0.1 * torch.randn(n_points) # 添加噪声
# 定义拟合函数: z = a*sin(b*x) * c*cos(d*y) + e
params = torch.tensor([1.0, 1.0, 1.0, 1.0, 0.0], requires_grad=True)
def fitted_function(x, y, params):
a, b, c, d, e = params
return a * torch.sin(b * x) * c * torch.cos(d * y) + e
optimizer = torch.optim.Adam([params], lr=0.01)
for i in range(1000):
optimizer.zero_grad()
z_pred = fitted_function(x_data, y_data, params)
loss = torch.mean((z_pred - z_data)**2)
loss.backward()
optimizer.step()
if i % 200 == 0:
print(f"Epoch {i}: MSE={loss.item():.6f}")
print(f"拟合参数: {params.detach().numpy()}")
print(f"真实参数: [1, 1, 1, 1, 0]")
# 运行数值优化演示
numerical_optimization()
2.7 本章总结
2.7.1 关键概念回顾
def chapter_summary():
"""第2章关键概念总结"""
print("\n📚 第2章总结:张量操作与自动微分")
print("=" * 50)
key_concepts = {
"张量基础": [
"内存模型和存储结构",
"数据类型和设备管理",
"视图vs拷贝机制",
"张量创建的多种方法"
],
"张量操作": [
"形状变换和维度操作",
"索引、切片和花式索引",
"广播机制和自动扩展",
"数学运算和线性代数"
],
"自动微分": [
"计算图的构建和管理",
"梯度计算和反向传播",
"高阶导数和雅可比矩阵",
"梯度控制和优化技术"
],
"性能优化": [
"内存管理和就地操作",
"梯度累积和混合精度",
"计算图优化策略",
"自定义函数和钩子"
]
}
for category, items in key_concepts.items():
print(f"\n{category}:")
for item in items:
print(f" • {item}")
print(f"\n🎯 学习成果:")
achievements = [
"✅ 深入理解张量的内存模型和数据结构",
"✅ 掌握各种张量操作和变换技巧",
"✅ 理解广播机制和高级索引方法",
"✅ 掌握自动微分系统的工作原理",
"✅ 学会梯度计算和反向传播实现",
"✅ 了解计算图优化和内存管理技术"
]
for achievement in achievements:
print(f" {achievement}")
# 显示章节总结
chapter_summary()
2.7.2 下一章预告
def next_chapter_preview():
"""下一章内容预告"""
print(f"\n🔮 下一章预告:神经网络基础")
print("=" * 40)
preview = [
"🧠 nn.Module类的设计原理",
"🔧 神经网络层的定义和使用",
"➡️ 前向传播的实现机制",
"🔄 参数初始化和管理",
"📊 损失函数和激活函数",
"🏗️ 构建复杂网络架构"
]
for item in preview:
print(f" {item}")
print(f"\n💡 学习建议:")
suggestions = [
"熟练掌握张量的各种操作",
"理解自动微分的工作原理",
"多练习梯度计算和反向传播",
"准备线性代数和微积分基础"
]
for suggestion in suggestions:
print(f" • {suggestion}")
# 显示下一章预告
next_chapter_preview()
2.7.3 练习题
def practice_exercises():
"""本章练习题"""
print(f"\n📝 本章练习题")
print("=" * 30)
exercises = {
"基础练习": [
"创建不同类型和形状的张量",
"实现张量的各种变换操作",
"练习广播机制和索引操作",
"计算简单函数的梯度"
],
"进阶练习": [
"实现自定义激活函数",
"计算多变量函数的雅可比矩阵",
"使用自动微分求解优化问题",
"分析计算图的内存使用"
],
"项目练习": [
"实现数值优化算法",
"构建自定义的自动微分系统",
"优化大规模张量运算的性能",
"实现混合精度训练框架"
]
}
for level, tasks in exercises.items():
print(f"\n{level}:")
for i, task in enumerate(tasks, 1):
print(f" {i}. {task}")
print(f"\n🎯 完成建议:")
print(" • 从基础练习开始,逐步提高难度")
print(" • 每个练习都要动手编程实现")
print(" • 理解底层原理,不只是使用API")
print(" • 关注性能优化和内存管理")
# 显示练习题
practice_exercises()
2.7.4 代码示例总结
def code_examples_summary():
"""代码示例总结"""
print(f"\n💻 本章代码示例总结")
print("=" * 40)
examples = {
"张量操作": [
"tensor_memory_model() - 内存模型演示",
"tensor_data_types() - 数据类型转换",
"tensor_creation_methods() - 创建方法大全",
"tensor_shape_operations() - 形状变换",
"tensor_indexing_slicing() - 索引切片",
"broadcasting_mechanism() - 广播机制"
],
"数学运算": [
"basic_math_operations() - 基本运算",
"linear_algebra_operations() - 线性代数",
"statistical_operations() - 统计运算"
],
"自动微分": [
"autograd_basics() - 基础概念",
"advanced_autograd() - 高级技术",
"gradient_control() - 梯度控制",
"computational_graph_analysis() - 计算图分析"
],
"优化技术": [
"memory_optimization() - 内存优化",
"custom_activation_function() - 自定义函数",
"numerical_optimization() - 数值优化"
]
}
for category, funcs in examples.items():
print(f"\n{category}:")
for func in funcs:
print(f" • {func}")
print(f"\n📖 学习资源:")
resources = [
"PyTorch官方文档 - 张量操作",
"PyTorch官方教程 - 自动微分",
"《深度学习》- Goodfellow等著",
"《PyTorch深度学习实战》"
]
for resource in resources:
print(f" • {resource}")
# 显示代码示例总结
code_examples_summary()
🎉 恭喜完成第2章!
您已经成功完成了张量操作与自动微分的学习。现在您应该:
- ✅ 深入理解PyTorch张量的内存模型和数据结构
- ✅ 掌握各种张量操作、变换和索引技巧
- ✅ 理解广播机制和高级数学运算
- ✅ 掌握自动微分系统的工作原理
- ✅ 学会梯度计算、反向传播和计算图优化
- ✅ 了解内存管理和性能优化技术
下一步:继续学习第3章:神经网络基础,学习如何使用PyTorch构建和训练神经网络。
重要提示:
- 张量操作是PyTorch的基础,务必熟练掌握
- 自动微分是深度学习的核心,理解其原理很重要
- 多做练习,特别是梯度计算和性能优化
- 关注内存使用,这在实际项目中非常重要
继续加油!🚀