第2章：数据结构详解（Series和DataFrame）

2.1 Series详解

2.1.1 Series基础概念

Series是pandas中的一维标记数组，可以存储任何数据类型（整数、字符串、浮点数、Python对象等）。它类似于NumPy数组，但具有标签索引功能。

import pandas as pd
import numpy as np

# Anatomy of a Series: an array of values paired with an index.
print("Series的基本结构:")
print("值(values) + 索引(index) = Series")

# Build a minimal Series from a list; no index is passed, so pandas
# supplies the default one.
s = pd.Series([1, 2, 3, 4, 5])
print("\n基础Series:")
print(s)
# Show the three core attributes: underlying values, index, and dtype.
print("值:", s.values)
print("索引:", s.index)
print("数据类型:", s.dtype)

2.1.2 Series创建方法

从列表创建

def create_series_from_list():
    """Show four ways of building a Series from a Python list.

    Prints each variant (default index, explicit labels, explicit dtype,
    named Series) and hands all of them back to the caller.

    Returns:
        tuple: ``(s1, s2, s3, s4)`` in the order they are printed.
    """
    tens = [10, 20, 30, 40, 50]

    # 1. Plain list, default index
    default_indexed = pd.Series(tens)
    print("1. 从基础列表创建:")
    print(default_indexed)

    # 2. Same values, explicit string labels
    labelled = pd.Series(tens, index=['a', 'b', 'c', 'd', 'e'])
    print("\n2. 指定索引:")
    print(labelled)

    # 3. Integers stored with a float64 dtype
    as_float = pd.Series([1, 2, 3, 4, 5], dtype='float64')
    print("\n3. 指定数据类型:")
    print(as_float)

    # 4. Named Series with month labels
    monthly_sales = pd.Series(
        [100, 200, 300],
        index=['Jan', 'Feb', 'Mar'],
        name='Sales',
    )
    print("\n4. 指定名称:")
    print(monthly_sales)

    return default_indexed, labelled, as_float, monthly_sales


# Run the example
series_list = create_series_from_list()

从字典创建

def create_series_from_dict():
    """Show how a Series is built from dictionaries.

    Covers a flat dict, re-ordering through ``index=``, requesting a label
    the dict does not contain (which yields NaN), and pulling one field out
    of a nested dict.

    Returns:
        tuple: ``(s1, s2, s3, sales_series, profit_series)``.
    """
    stock = {'apple': 5, 'banana': 3, 'orange': 8, 'grape': 2}

    # 1. Dict keys become the index
    from_dict = pd.Series(stock)
    print("1. 从字典创建:")
    print(from_dict)

    # 2. The index argument controls the order of the result
    reordered = pd.Series(stock, index=['banana', 'apple', 'grape', 'orange'])
    print("\n2. 指定索引顺序:")
    print(reordered)

    # 3. 'mango' is not a key of the dict, so its value is NaN
    partial = pd.Series(stock, index=['apple', 'banana', 'mango'])
    print("\n3. 部分索引:")
    print(partial)

    # 4. Nested dict: one inner record per quarter
    quarterly = {
        'Q1': {'sales': 1000, 'profit': 200},
        'Q2': {'sales': 1200, 'profit': 250},
        'Q3': {'sales': 1100, 'profit': 220}
    }

    def _field_series(field):
        # Collect a single inner field across all quarters.
        return pd.Series(
            {quarter: record[field] for quarter, record in quarterly.items()}
        )

    sales_series = _field_series('sales')
    profit_series = _field_series('profit')

    print("\n4. 从嵌套字典创建:")
    print("销售额Series:")
    print(sales_series)
    print("\n利润Series:")
    print(profit_series)

    return from_dict, reordered, partial, sales_series, profit_series


# Run the example
series_dict = create_series_from_dict()

从NumPy数组创建

def create_series_from_numpy():
    """Show how a Series is built from NumPy arrays.

    Prints and returns four Series: one from an integer array, one from a
    seeded random array, one from ``np.linspace`` output, and one from a
    boolean array.

    Returns:
        tuple: ``(s1, s2, s3, s4)`` in the order they are printed.
    """
    # 1. One-dimensional integer array
    ints = pd.Series(np.array([1, 2, 3, 4, 5]))
    print("1. 从一维数组创建:")
    print(ints)

    # 2. Random normal samples; the seed keeps the output reproducible
    np.random.seed(42)
    samples = np.random.randn(5)
    randoms = pd.Series(samples, index=['A', 'B', 'C', 'D', 'E'])
    print("\n2. 从随机数组创建:")
    print(randoms)

    # 3. Six evenly spaced points from 0 to 10
    evenly_spaced = pd.Series(np.linspace(0, 10, 6), name='Linear_Space')
    print("\n3. 从线性空间数组创建:")
    print(evenly_spaced)

    # 4. Boolean flags labelled by weekday
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']
    flags = pd.Series(np.array([True, False, True, False, True]), index=weekdays)
    print("\n4. 从布尔数组创建:")
    print(flags)

    return ints, randoms, evenly_spaced, flags


# Run the example
series_numpy = create_series_from_numpy()

2.1.3 Series索引操作

基础索引

def series_indexing_demo():
    """Walk through the common ways of selecting data from a Series.

    Demonstrates positional access, label access, multi-label selection,
    label slicing, boolean masks and ``Index.isin`` on a small fruit
    inventory, printing each result.

    Returns:
        pandas.Series: The ``quantity`` Series used throughout the demo.
    """
    # Sample data: quantities keyed by fruit name
    fruits = pd.Series([5, 3, 8, 2, 6],
                       index=['apple', 'banana', 'orange', 'grape', 'mango'],
                       name='quantity')

    print("示例Series:")
    print(fruits)

    # 1. Positional access.
    # Use .iloc explicitly: with a string-labelled index, fruits[0] only works
    # through a positional fallback that pandas has deprecated (FutureWarning
    # in pandas 2.1+), so integer keys in [] should not be relied on.
    print("\n1. 位置索引:")
    print(f"第一个元素: {fruits.iloc[0]}")
    print(f"最后一个元素: {fruits.iloc[-1]}")
    print(f"前三个元素:\n{fruits.iloc[:3]}")

    # 2. Label access
    print("\n2. 标签索引:")
    print(f"apple的数量: {fruits['apple']}")
    print(f"banana的数量: {fruits['banana']}")

    # 3. Several labels at once (pass a list)
    print("\n3. 多个索引:")
    print(f"apple和orange:\n{fruits[['apple', 'orange']]}")

    # 4. Label slicing (unlike positional slicing, the end label is included)
    print("\n4. 切片操作:")
    print(f"从banana到grape:\n{fruits['banana':'grape']}")

    # 5. Boolean mask
    print("\n5. 布尔索引:")
    print(f"数量大于4的水果:\n{fruits[fruits > 4]}")

    # 6. Membership test on the index
    print("\n6. isin方法:")
    selected_fruits = fruits[fruits.index.isin(['apple', 'mango'])]
    print(f"选择apple和mango:\n{selected_fruits}")

    return fruits


# Run the example
fruits_series = series_indexing_demo()

高级索引

def advanced_series_indexing():
    """Demonstrate loc/iloc, boolean filters and ``where`` on a time series.

    Builds ten daily random values starting 2024-01-01. No seed is set here,
    so the values come from whatever state NumPy's global generator is in.

    Returns:
        pandas.Series: The ``daily_returns`` series indexed by date.
    """
    # Ten consecutive calendar days as the index
    days = pd.date_range('2024-01-01', periods=10, freq='D')
    ts = pd.Series(np.random.randn(10), index=days, name='daily_returns')

    print("时间序列数据:")
    print(ts)

    # 1. Label-based (loc) vs position-based (iloc) lookup of the third day
    by_label = ts.loc['2024-01-03']
    by_position = ts.iloc[2]
    print("\n1. loc和iloc:")
    print(f"使用loc: {by_label}")
    print(f"使用iloc: {by_position}")

    # 2. Keep only strictly positive values
    print("\n2. 条件索引:")
    is_gain = ts > 0
    print(f"正收益日:\n{ts[is_gain]}")

    # 3. Combine two masks with & (each side is its own comparison)
    print("\n3. 复合条件:")
    above_floor = ts > -0.5
    below_ceiling = ts < 0.5
    print(f"适中收益(-0.5到0.5):\n{ts[above_floor & below_ceiling]}")

    # 4. where() keeps values where the condition holds and substitutes 0
    #    elsewhere, so the result has the same length as the input
    print("\n4. where方法:")
    clipped = ts.where(ts > 0, 0)
    print(f"负值替换为0:\n{clipped}")

    return ts


# Run the example
time_series = advanced_series_indexing()

2.1.4 Series运算和方法

数学运算

def series_mathematical_operations():
    """Demonstrate arithmetic, NumPy ufuncs and statistics on Series.

    Two Series sharing the labels ``a``..``e`` are combined element-wise,
    with scalars, passed through NumPy functions, and summarised.

    Returns:
        tuple: ``(s1, s2)``, the two input Series.
    """
    labels = ['a', 'b', 'c', 'd', 'e']
    s1 = pd.Series([1, 2, 3, 4, 5], index=labels)
    s2 = pd.Series([10, 20, 30, 40, 50], index=labels)

    print("Series 1:")
    print(s1)
    print("\nSeries 2:")
    print(s2)

    def _show_all(pairs):
        # Print each (caption, Series) pair as "caption:" followed by the data.
        for caption, result in pairs:
            print(f"{caption}:\n{result}")

    # 1. Element-wise arithmetic between the two Series
    print("\n1. 基础运算:")
    _show_all([
        ("加法", s1 + s2),
        ("减法", s2 - s1),
        ("乘法", s1 * s2),
        ("除法", s2 / s1),
    ])

    # 2. Arithmetic with a scalar applies to every element
    print("\n2. 标量运算:")
    _show_all([
        ("s1 + 10", s1 + 10),
        ("s1 * 2", s1 * 2),
        ("s1 ** 2", s1 ** 2),
    ])

    # 3. NumPy functions applied to the Series
    print("\n3. 数学函数:")
    _show_all([
        ("平方根", np.sqrt(s1)),
        ("对数", np.log(s1)),
        ("指数", np.exp(s1)),
    ])

    # 4. Summary statistics
    print("\n4. 统计方法:")
    summary = [
        ("总和", s1.sum()),
        ("平均值", s1.mean()),
        ("标准差", s1.std()),
        ("最大值", s1.max()),
        ("最小值", s1.min()),
        ("中位数", s1.median()),
    ]
    for caption, value in summary:
        print(f"{caption}: {value}")

    return s1, s2


# Run the example
s1, s2 = series_mathematical_operations()

字符串操作

def series_string_operations():
    """Demonstrate the ``.str`` accessor on a Series of full names.

    Covers case conversion, length, splitting, substring tests, replacement
    and regular-expression extraction.

    Returns:
        pandas.Series: The original, unmodified Series of names.
    """
    names = pd.Series([
        'Alice Johnson',
        'Bob Smith',
        'Charlie Brown',
        'Diana Wilson',
        'Eve Davis',
    ])

    print("原始字符串Series:")
    print(names)

    # 1. Case conversion and length
    print("\n1. 基础字符串方法:")
    print(f"转大写:\n{names.str.upper()}")
    print(f"转小写:\n{names.str.lower()}")
    print(f"字符串长度:\n{names.str.len()}")

    # 2. Split on a space; .str[i] then picks the i-th piece of every row
    print("\n2. 字符串分割:")
    pieces = names.str.split(' ')
    print(f"分割结果:\n{pieces}")

    given = pieces.str[0]
    family = pieces.str[1]
    print(f"名字:\n{given}")
    print(f"姓氏:\n{family}")

    # 3. Substring test; the boolean result doubles as a filter mask
    print("\n3. 字符串包含:")
    has_son = names.str.contains('son')
    print(f"包含'son':\n{has_son}")
    print(f"包含'son'的名字:\n{names[has_son]}")

    # 4. Replacement
    print("\n4. 字符串替换:")
    swapped = names.str.replace('Johnson', 'Jackson')
    print(f"替换后:\n{swapped}")

    # 5. Regex: one capture group per word, each grabbing its first letter
    print("\n5. 正则表达式:")
    initials = names.str.extract(r'(\w)\w* (\w)\w*')
    print(f"首字母:\n{initials}")

    return names


# Run the example
names_series = series_string_operations()

2.2 DataFrame详解

2.2.1 DataFrame基础概念

DataFrame是pandas中的二维标记数据结构，具有行索引和列索引。可以将其视为Series的容器，或者类似于Excel表格或SQL表。

def dataframe_basic_concepts():
    """Introduce the DataFrame and print its basic attributes.

    Builds a four-row table of people and reports its shape, row index,
    column index, per-column dtypes and memory usage.

    Returns:
        pandas.DataFrame: The sample table.
    """
    print("DataFrame的基本结构:")
    print("行索引(index) + 列索引(columns) + 数据(values) = DataFrame")

    # Column-oriented input: every key becomes a column
    people = {
        'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
        'age': [25, 30, 35, 28],
        'city': ['New York', 'London', 'Tokyo', 'Paris'],
        'salary': [50000, 60000, 70000, 55000]
    }
    df = pd.DataFrame(people)

    print("\n基础DataFrame:")
    print(df)

    # Structural attributes of the frame
    print("\n基本属性:")
    print(f"形状: {df.shape}")
    print(f"行索引: {df.index}")
    print(f"列索引: {df.columns}")
    print(f"数据类型:\n{df.dtypes}")
    print(f"内存使用:\n{df.memory_usage()}")

    return df


# Run the example
basic_df = dataframe_basic_concepts()

2.2.2 DataFrame创建方法

从字典创建

def create_dataframe_from_dict():
    """Show several dictionary layouts that ``pd.DataFrame`` accepts.

    Covers a dict of lists, the same with an explicit index, a dict of
    dicts (transposed so quarters become rows), and a dict of Series.

    Returns:
        tuple: ``(df1, df2, df3, df4)`` in the order they are printed.
    """
    # 1. Dict of equal-length lists: keys are column names
    catalog = {
        'product': ['A', 'B', 'C', 'D'],
        'price': [100, 150, 200, 120],
        'quantity': [10, 5, 8, 12]
    }
    from_lists = pd.DataFrame(catalog)
    print("1. 从基础字典创建:")
    print(from_lists)

    # 2. Same data with custom row labels
    with_labels = pd.DataFrame(catalog, index=['P1', 'P2', 'P3', 'P4'])
    print("\n2. 指定索引:")
    print(with_labels)

    # 3. Dict of dicts: outer keys become columns, so transpose to get one
    #    row per quarter
    quarters = {
        'Q1': {'sales': 1000, 'profit': 200, 'expenses': 800},
        'Q2': {'sales': 1200, 'profit': 250, 'expenses': 950},
        'Q3': {'sales': 1100, 'profit': 220, 'expenses': 880},
        'Q4': {'sales': 1300, 'profit': 280, 'expenses': 1020}
    }
    by_quarter = pd.DataFrame(quarters).T
    print("\n3. 从字典的字典创建:")
    print(by_quarter)

    # 4. Dict of Series: the dict keys, not the Series names, label columns
    columns = {
        'col1': pd.Series([1, 2, 3, 4], name='A'),
        'col2': pd.Series([10, 20, 30, 40], name='B'),
        'col3': pd.Series([100, 200, 300, 400], name='C'),
    }
    from_series = pd.DataFrame(columns)
    print("\n4. 从Series字典创建:")
    print(from_series)

    return from_lists, with_labels, by_quarter, from_series


# Run the example
dict_dataframes = create_dataframe_from_dict()

从列表创建

def create_dataframe_from_list():
    """Show how a DataFrame is built from row-oriented lists.

    Covers a list of lists, a list of dicts and a list of tuples.

    Returns:
        tuple: ``(df1, df2, df3)`` in the order they are printed.
    """
    # 1. List of lists: each inner list is a row, column names are supplied
    staff_rows = [
        ['Alice', 25, 'Engineer'],
        ['Bob', 30, 'Manager'],
        ['Charlie', 35, 'Analyst'],
        ['Diana', 28, 'Designer']
    ]
    staff = pd.DataFrame(staff_rows, columns=['Name', 'Age', 'Job'])
    print("1. 从二维列表创建:")
    print(staff)

    # 2. List of dicts: column names come from the dict keys
    people_records = [
        {'name': 'Alice', 'age': 25, 'city': 'NY'},
        {'name': 'Bob', 'age': 30, 'city': 'LA'},
        {'name': 'Charlie', 'age': 35, 'city': 'SF'},
        {'name': 'Diana', 'age': 28, 'city': 'Chicago'}
    ]
    people = pd.DataFrame(people_records)
    print("\n2. 从字典列表创建:")
    print(people)

    # 3. List of tuples: handled like a list of lists
    product_rows = [
        ('Product A', 100, 10),
        ('Product B', 150, 5),
        ('Product C', 200, 8),
        ('Product D', 120, 12)
    ]
    products = pd.DataFrame(product_rows, columns=['Product', 'Price', 'Stock'])
    print("\n3. 从元组列表创建:")
    print(products)

    return staff, people, products


# Run the example
list_dataframes = create_dataframe_from_list()

从NumPy数组创建

def create_dataframe_from_numpy():
    """Show how a DataFrame is built from NumPy arrays.

    Covers a plain 2-D array, the same array with row/column labels, a
    structured array, and random values indexed by dates.

    Returns:
        tuple: ``(df1, df2, df3, df4)`` in the order they are printed.
    """
    # 1. Seeded 4x3 random matrix with named columns
    np.random.seed(42)
    matrix = np.random.randn(4, 3)
    plain = pd.DataFrame(matrix, columns=['A', 'B', 'C'])
    print("1. 从二维数组创建:")
    print(plain)

    # 2. Same matrix with both row and column labels
    row_labels = ['Row1', 'Row2', 'Row3', 'Row4']
    col_labels = ['Col1', 'Col2', 'Col3']
    labelled = pd.DataFrame(matrix, index=row_labels, columns=col_labels)
    print("\n2. 指定索引和列名:")
    print(labelled)

    # 3. Structured array: the field names of the dtype become column names
    record_dtype = [('name', 'U10'), ('age', 'i4'), ('salary', 'i4')]
    records = np.array([
        ('Alice', 25, 50000),
        ('Bob', 30, 60000),
        ('Charlie', 35, 70000)
    ], dtype=record_dtype)
    from_records = pd.DataFrame(records)
    print("\n3. 从结构化数组创建:")
    print(from_records)

    # 4. Random values on a five-day date index; drawn after the first
    #    matrix, so they continue the same seeded random stream
    days = pd.date_range('2024-01-01', periods=5)
    prices = pd.DataFrame(
        np.random.randn(5, 3),
        index=days,
        columns=['Stock_A', 'Stock_B', 'Stock_C'],
    )
    print("\n4. 时间序列DataFrame:")
    print(prices)

    return plain, labelled, from_records, prices


# Run the example
numpy_dataframes = create_dataframe_from_numpy()

2.2.3 DataFrame索引操作

列操作

def dataframe_column_operations():
    """Demonstrate selecting, adding, dropping, renaming and reordering columns.

    The ``bonus`` and ``total_compensation`` columns are added to the frame
    in place, so the returned frame includes them. Dropping and renaming
    produce new frames and leave the original untouched.

    Returns:
        pandas.DataFrame: The employee table including the two added columns.
    """
    employees = {
        'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
        'age': [25, 30, 35, 28, 32],
        'city': ['NY', 'LA', 'SF', 'Chicago', 'Boston'],
        'salary': [50000, 60000, 70000, 55000, 65000],
        'department': ['IT', 'HR', 'IT', 'Marketing', 'IT']
    }
    df = pd.DataFrame(employees)

    print("原始DataFrame:")
    print(df)

    # 1. Single column: bracket access and attribute access
    print("\n1. 选择单列:")
    print(f"姓名列:\n{df['name']}")
    print(f"年龄列:\n{df.age}")  # attribute-style access

    # 2. Several columns: pass a list of names
    print("\n2. 选择多列:")
    picked = df[['name', 'age', 'salary']]
    print(picked)

    # 3. New columns derived from existing ones (modifies df)
    print("\n3. 添加新列:")
    df['bonus'] = df['salary'] * 0.1
    df['total_compensation'] = df['salary'] + df['bonus']
    print(df[['name', 'salary', 'bonus', 'total_compensation']])

    # 4. drop() returns a new frame without the column
    print("\n4. 删除列:")
    without_bonus = df.drop(['bonus'], axis=1)
    print(f"删除bonus列后的列名: {without_bonus.columns.tolist()}")

    # 5. rename() maps old column names to new ones in a new frame
    print("\n5. 重命名列:")
    name_map = {'name': 'employee_name', 'age': 'employee_age'}
    relabelled = df.rename(columns=name_map)
    print(f"重命名后的列名: {relabelled.columns.tolist()}")

    # 6. Reorder by selecting the columns in the desired order
    print("\n6. 列排序:")
    wanted_order = ['name', 'department', 'age', 'city', 'salary', 'bonus', 'total_compensation']
    print(df[wanted_order].head())

    return df


# Run the example
df_with_columns = dataframe_column_operations()

行操作

def dataframe_row_operations():
    """Demonstrate selecting, filtering, appending and dropping rows.

    Appending and dropping produce new frames; the frame returned at the
    end is the original five-row table.

    Returns:
        pandas.DataFrame: The unmodified employee table.
    """
    # Build a fresh employee table; no state is shared with other demos
    employees = {
        'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
        'age': [25, 30, 35, 28, 32],
        'city': ['NY', 'LA', 'SF', 'Chicago', 'Boston'],
        'salary': [50000, 60000, 70000, 55000, 65000],
        'department': ['IT', 'HR', 'IT', 'Marketing', 'IT']
    }
    df = pd.DataFrame(employees)

    print("原始DataFrame:")
    print(df)

    # 1. One row: by position (iloc) and by index label (loc)
    print("\n1. 选择单行:")
    print(f"第一行:\n{df.iloc[0]}")
    print(f"索引为2的行:\n{df.loc[2]}")

    # 2. Several rows: a positional slice or an explicit list of positions
    print("\n2. 选择多行:")
    print(f"前三行:\n{df.iloc[:3]}")
    print(f"指定行:\n{df.iloc[[0, 2, 4]]}")

    # 3. Boolean filters on a single column
    print("\n3. 条件选择:")
    in_it = df['department'] == 'IT'
    print(f"IT部门员工:\n{df[in_it]}")

    well_paid = df['salary'] > 60000
    print(f"\n高薪员工:\n{df[well_paid]}")

    # 4. Two conditions combined with &
    print("\n4. 复合条件:")
    is_young = df['age'] < 30
    earns_more = df['salary'] > 55000
    print(f"年轻高薪员工:\n{df[is_young & earns_more]}")

    # 5. Append a row by concatenating a one-row frame; ignore_index
    #    renumbers the result
    print("\n5. 添加新行:")
    frank = pd.DataFrame({
        'name': ['Frank'],
        'age': [29],
        'city': ['Seattle'],
        'salary': [58000],
        'department': ['Finance']
    })
    extended = pd.concat([df, frank], ignore_index=True)
    print(extended)

    # 6. Drop the rows whose index labels are 1 and 3
    print("\n6. 删除行:")
    trimmed = df.drop([1, 3])
    print(trimmed)

    return df


# Run the example
df_with_rows = dataframe_row_operations()

loc和iloc详解

def loc_iloc_detailed():
    """Compare position-based ``iloc`` with label-based ``loc``.

    All assignments are made on a copy, so the frame that is returned still
    holds the original values.

    Returns:
        pandas.DataFrame: The unmodified five-row sample frame.
    """
    row_labels = ['row1', 'row2', 'row3', 'row4', 'row5']
    df = pd.DataFrame({
        'A': [1, 2, 3, 4, 5],
        'B': [10, 20, 30, 40, 50],
        'C': [100, 200, 300, 400, 500],
        'D': ['a', 'b', 'c', 'd', 'e']
    }, index=row_labels)

    print("示例DataFrame:")
    print(df)

    # 1. iloc: integer positions, end of slice excluded
    print("\n1. iloc - 基于位置的索引:")
    print(f"第一行第一列: {df.iloc[0, 0]}")
    print(f"前两行前两列:\n{df.iloc[:2, :2]}")
    print(f"最后一行:\n{df.iloc[-1]}")
    print(f"第2和第4行:\n{df.iloc[[1, 3]]}")

    # 2. loc: labels, both ends of a slice included
    print("\n2. loc - 基于标签的索引:")
    print(f"row1的A列: {df.loc['row1', 'A']}")
    print(f"row1到row3的A到C列:\n{df.loc['row1':'row3', 'A':'C']}")
    print(f"指定行和列:\n{df.loc[['row1', 'row3'], ['A', 'C']]}")

    # 3. loc also accepts boolean masks
    print("\n3. 条件索引:")
    a_over_two = df['A'] > 2
    b_under_forty = df['B'] < 40
    print(f"A列大于2的行:\n{df.loc[a_over_two]}")
    print(f"A列大于2且B列小于40:\n{df.loc[a_over_two & b_under_forty]}")

    # 4. Assignment through loc/iloc, done on a copy
    print("\n4. 设置值:")
    scratch = df.copy()
    scratch.loc['row1', 'A'] = 999
    scratch.iloc[0, 1] = 888
    print(f"修改后:\n{scratch}")

    # 5. Assigning to a label that does not exist yet creates it
    print("\n5. 添加新行和列:")
    scratch.loc['row6'] = [6, 60, 600, 'f']
    scratch.loc[:, 'E'] = [1000, 2000, 3000, 4000, 5000, 6000]
    print(scratch)

    return df


# Run the example
loc_iloc_df = loc_iloc_detailed()

2.2.4 DataFrame基本操作

数据查看和描述

def dataframe_inspection():
    """Demonstrate the usual ways of inspecting and summarising a DataFrame.

    Builds a seeded 100-row synthetic employee table and prints its shape,
    ``info()``, head/tail, a random sample, descriptive statistics, unique
    values and missing-value counts.

    Returns:
        pandas.DataFrame: The synthetic employee table.
    """
    # Seed the generator so the synthetic data is reproducible
    np.random.seed(42)
    n_rows = 100

    df = pd.DataFrame({
        'employee_id': range(1001, 1001 + n_rows),
        'name': [f'Employee_{i}' for i in range(n_rows)],
        'age': np.random.randint(22, 65, n_rows),
        'department': np.random.choice(['IT', 'HR', 'Finance', 'Marketing', 'Sales'], n_rows),
        'salary': np.random.normal(60000, 15000, n_rows),
        'years_experience': np.random.randint(0, 20, n_rows),
        'performance_score': np.random.uniform(1, 5, n_rows)
    })

    # 1. Size of the frame
    print("1. 基本信息:")
    print(f"形状: {df.shape}")
    print(f"列数: {len(df.columns)}")
    print(f"行数: {len(df)}")
    print(f"总元素数: {df.size}")

    # 2. Dtypes and memory usage.
    # info() writes its report to stdout itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    print("\n2. 数据类型和内存使用:")
    df.info()

    # 3. First and last rows
    print("\n3. 前5行:")
    print(df.head())

    print("\n后5行:")
    print(df.tail())

    # 4. Random sample of five rows
    print("\n4. 随机抽样:")
    print(df.sample(5))

    # 5. Summary statistics for the numeric columns
    print("\n5. 描述性统计:")
    print(df.describe())

    # 6. Summary restricted to object-dtype columns
    print("\n6. 非数值列的描述:")
    print(df.describe(include=['object']))

    # 7. Distinct values and their frequencies
    print("\n7. 唯一值统计:")
    print(f"部门唯一值: {df['department'].unique()}")
    print(f"部门计数:\n{df['department'].value_counts()}")

    # 8. Missing values per column
    print("\n8. 缺失值检查:")
    print(f"缺失值统计:\n{df.isnull().sum()}")

    return df


# Run the example
large_df = dataframe_inspection()

数据排序

def dataframe_sorting():
    """Demonstrate sorting a DataFrame by values, by index and by custom order.

    The custom-order example adds a categorical ``dept_cat`` column to the
    frame in place, so the returned frame contains it.

    Returns:
        pandas.DataFrame: The sample table including the ``dept_cat`` column.
    """
    df = pd.DataFrame({
        'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
        'age': [25, 30, 35, 28, 32],
        'salary': [50000, 60000, 70000, 55000, 65000],
        'department': ['IT', 'HR', 'IT', 'Marketing', 'IT'],
        'performance': [4.2, 3.8, 4.5, 4.0, 4.3]
    })

    print("原始DataFrame:")
    print(df)

    # 1. Ascending by one column
    print("\n1. 按年龄排序:")
    by_age = df.sort_values('age')
    print(by_age)

    # 2. Descending by one column
    print("\n2. 按薪资降序排序:")
    by_salary_desc = df.sort_values('salary', ascending=False)
    print(by_salary_desc)

    # 3. Two keys: department ascending, then salary descending within it
    print("\n3. 按部门和薪资排序:")
    sort_keys = ['department', 'salary']
    by_dept_then_salary = df.sort_values(sort_keys, ascending=[True, False])
    print(by_dept_then_salary)

    # 4. Shuffle the rows, then restore the order by sorting on the index
    print("\n4. 按索引排序:")
    shuffled = df.sample(frac=1)
    print("打乱后:")
    print(shuffled)

    restored = shuffled.sort_index()
    print("按索引排序后:")
    print(restored)

    # 5. Custom order: an ordered categorical sorts by its category order
    #    rather than alphabetically
    print("\n5. 自定义排序:")
    dept_order = ['IT', 'HR', 'Marketing']
    df['dept_cat'] = pd.Categorical(df['department'], categories=dept_order, ordered=True)
    by_custom_order = df.sort_values('dept_cat')
    print(by_custom_order[['name', 'department', 'salary']])

    return df


# Run the example
sorted_df = dataframe_sorting()

2.3 索引和标签高级操作

2.3.1 索引设置和重置

def index_operations():
    """Demonstrate setting, resetting, dropping and renaming an index.

    Returns:
        tuple: ``(df_name_index, df_multi)``, the frame indexed by ``name``
        and the frame indexed by ``(city, name)``.
    """
    df = pd.DataFrame({
        'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
        'age': [25, 30, 35, 28, 32],
        'city': ['NY', 'LA', 'SF', 'Chicago', 'Boston'],
        'salary': [50000, 60000, 70000, 55000, 65000]
    })

    print("原始DataFrame:")
    print(df)

    # 1. Promote a column to the index
    print("\n1. 设置name为索引:")
    by_name = df.set_index('name')
    print(by_name)

    # 2. Two columns give a two-level MultiIndex
    print("\n2. 设置多级索引:")
    by_city_and_name = df.set_index(['city', 'name'])
    print(by_city_and_name)

    # 3. reset_index() moves the index back into a regular column
    print("\n3. 重置索引:")
    restored = by_name.reset_index()
    print(restored)

    # 4. drop=True discards the index instead of keeping it as a column
    print("\n4. 删除索引:")
    index_discarded = by_name.reset_index(drop=True)
    print(index_discarded)

    # 5. rename_axis() changes the index name, not its labels
    print("\n5. 索引重命名:")
    renamed_axis = by_name.rename_axis('employee_name')
    print(renamed_axis)

    return by_name, by_city_and_name


# Run the example
name_indexed_df, multi_indexed_df = index_operations()

2.3.2 多级索引操作

def multiindex_operations():
    """Demonstrate selection and reshaping on a two-level MultiIndex.

    Covers ``loc`` on the outer level and on a full key, ``xs`` on the inner
    level, ``swaplevel``, ``sort_index``, ``unstack`` and ``stack``.

    Returns:
        pandas.DataFrame: The original frame indexed by ``(first, second)``.
    """
    # Two parallel label arrays, one per index level
    outer = ['A', 'A', 'B', 'B', 'C', 'C']
    inner = ['one', 'two', 'one', 'two', 'one', 'two']
    index = pd.MultiIndex.from_arrays([outer, inner], names=['first', 'second'])

    df = pd.DataFrame({
        'value1': [1, 2, 3, 4, 5, 6],
        'value2': [10, 20, 30, 40, 50, 60],
        'value3': [100, 200, 300, 400, 500, 600]
    }, index=index)

    print("多级索引DataFrame:")
    print(df)

    # 1. An outer-level label selects every row beneath it
    print("\n1. 选择A级别的所有数据:")
    print(df.loc['A'])

    # 2. A full (outer, inner) tuple selects a single row
    print("\n2. 选择A-one组合:")
    print(df.loc[('A', 'one')])

    # 3. xs() takes a cross-section on a named level
    print("\n3. 使用xs选择second级别为'one'的数据:")
    print(df.xs('one', level='second'))

    # 4. Swap the two levels
    print("\n4. 交换索引级别:")
    swapped = df.swaplevel('first', 'second')
    print(swapped)

    # 5. Sort the swapped frame by its index
    print("\n5. 按索引排序:")
    print(swapped.sort_index())

    # 6. unstack() pivots the innermost index level into the columns
    print("\n6. 展开索引:")
    wide = df.unstack()
    print(wide)

    # 7. stack() pivots the innermost column level back into the index
    print("\n7. 重新堆叠:")
    print(wide.stack())

    return df


# Run the example
multi_df = multiindex_operations()

2.4 数据类型和内存管理

2.4.1 数据类型优化

def optimize_data_types():
    """Demonstrate shrinking a DataFrame's memory footprint via dtypes.

    Builds a 5000-row frame, then downcasts the integer and float columns
    and converts the repetitive string column to ``category``, printing the
    memory usage before and after.

    Returns:
        pandas.DataFrame: The frame after the dtype conversions.
    """
    # Five distinct values repeated 1000 times per column, plus hourly stamps.
    # The hourly alias is the lowercase 'h': the uppercase 'H' spelling is
    # deprecated in recent pandas releases and triggers a FutureWarning.
    df = pd.DataFrame({
        'int_col': [1, 2, 3, 4, 5] * 1000,
        'float_col': [1.1, 2.2, 3.3, 4.4, 5.5] * 1000,
        'string_col': ['A', 'B', 'C', 'D', 'E'] * 1000,
        'bool_col': [True, False, True, False, True] * 1000,
        'date_col': pd.date_range('2024-01-01', periods=5000, freq='h')
    })

    print("优化前的数据类型和内存使用:")
    print(df.dtypes)
    # deep=True also counts the Python string objects, not just the pointers
    print(f"\n内存使用: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")

    # 1. Downcast integers to the smallest integer type that fits the values
    print("\n1. 整数类型优化:")
    df['int_col'] = pd.to_numeric(df['int_col'], downcast='integer')
    print(f"优化后int_col类型: {df['int_col'].dtype}")

    # 2. Downcast floats to a smaller float type
    print("\n2. 浮点类型优化:")
    df['float_col'] = pd.to_numeric(df['float_col'], downcast='float')
    print(f"优化后float_col类型: {df['float_col'].dtype}")

    # 3. A column with few distinct values is much smaller as a category
    print("\n3. 分类数据优化:")
    print(f"优化前string_col内存: {df['string_col'].memory_usage(deep=True)} bytes")
    df['string_col'] = df['string_col'].astype('category')
    print(f"优化后string_col类型: {df['string_col'].dtype}")
    print(f"优化后string_col内存: {df['string_col'].memory_usage(deep=True)} bytes")

    # 4. The boolean column is only reported, not converted
    print("\n4. 布尔类型:")
    print(f"bool_col类型: {df['bool_col'].dtype}")

    print(f"\n优化后总内存使用: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")

    return df


# Run the example
optimized_df = optimize_data_types()

2.4.2 分类数据处理

def categorical_data_demo():
    """Demonstrate creating and manipulating categorical columns.

    Converts two string columns to ``category``, gives ``size`` an explicit
    order, counts values, and then adds and renames categories.

    Returns:
        pandas.DataFrame: The frame with ``size`` as an ordered categorical
        using the renamed labels and ``color`` as an unordered categorical.
    """
    df = pd.DataFrame({
        'size': ['small', 'medium', 'large', 'small', 'large', 'medium', 'small'],
        'color': ['red', 'blue', 'green', 'red', 'blue', 'green', 'red'],
        'price': [10, 20, 30, 15, 35, 25, 12]
    })

    print("原始DataFrame:")
    print(df)
    print(f"数据类型:\n{df.dtypes}")

    # 1. Convert both string columns to the category dtype
    print("\n1. 转换为分类数据:")
    for column in ('size', 'color'):
        df[column] = df[column].astype('category')
    print(f"转换后数据类型:\n{df.dtypes}")

    # 2. Inspect the categories of each column
    print("\n2. 分类信息:")
    print(f"size分类: {df['size'].cat.categories}")
    print(f"color分类: {df['color'].cat.categories}")

    # 3. Rebuild 'size' as an ordered categorical: small < medium < large
    print("\n3. 有序分类:")
    size_order = ['small', 'medium', 'large']
    df['size'] = pd.Categorical(df['size'], categories=size_order, ordered=True)
    print(f"有序分类: {df['size'].cat.ordered}")
    print(f"分类顺序: {df['size'].cat.categories}")

    # 4. Frequency of each category
    print("\n4. 分类统计:")
    print(f"size计数:\n{df['size'].value_counts()}")
    print(f"color计数:\n{df['color'].value_counts()}")

    # 5. Register a category that no row uses yet
    print("\n5. 添加新分类:")
    df['size'] = df['size'].cat.add_categories(['extra_large'])
    print(f"添加后的分类: {df['size'].cat.categories}")

    # 6. Rename every category through an old -> new mapping
    print("\n6. 重命名分类:")
    short_labels = {
        'small': 'S',
        'medium': 'M',
        'large': 'L',
        'extra_large': 'XL'
    }
    df['size'] = df['size'].cat.rename_categories(short_labels)
    print(f"重命名后:\n{df['size']}")

    return df


# Run the example
categorical_df = categorical_data_demo()

2.5 本章小结

2.5.1 核心知识点

Series数据结构
- 一维标记数组
- 支持多种创建方式
- 强大的索引功能
- 丰富的操作方法
DataFrame数据结构
- 二维标记数据结构
- 行索引和列索引
- 灵活的数据操作
- 类似Excel表格
索引系统
- 位置索引和标签索引
- loc和iloc的区别
- 多级索引操作
- 索引设置和重置
数据类型管理
- 数据类型优化
- 分类数据处理
- 内存使用优化

2.5.2 实践要点

理解Series和DataFrame的区别和联系
掌握各种创建数据结构的方法
熟练使用索引进行数据选择
合理使用数据类型优化内存

2.5.3 下一步学习

在下一章中，我们将学习：
- 各种文件格式的读取和写入
- 数据库连接和操作
- 网络数据获取
- 数据导出和保存

练习题

创建一个包含学生信息的DataFrame，包括姓名、年龄、成绩等
使用不同方法选择和过滤数据
练习多级索引的创建和操作
优化DataFrame的数据类型和内存使用
创建分类数据并进行统计分析

记住：熟练掌握数据结构是数据分析的基础！

📂 分类导航

▶ 学与练
- ▶ 软件技术基础
  - ▶ 操作系统技术
    - Linux实战
    - ▶ Linux技巧
      - debug-remote-api.md
  - ▶ 容器化与编排
    - Docker实战
    - ▶ Docker高级
- ▶ 前端开发技术
  - ▶ 框架与库
    - js
    - vue
  - ▶ 前端生态
    - bootstrap
    - vue-ssr
- ▶ 后端开发技术
  - ▶ 编程语言
    - ▶ Java
    - ▶ Go
      - go-server.md
      - mini.md
    - Rust
    - Python
    - csharp
  - ▶ 中间件
    - redis
    - ▶ minio
      - minio.md
    - elasticsearch
    - kafka
    - elk
    - caddy
  - ▶ 数据库
    - MySQL
    - SQLServer
    - ▶ Dameng
      - sql.md
    - clickhouse
- ▶ 数据开发与运维
  - ▶ 数据开发
    - hadoop
  - ▶ 运维开发
    - ▶ CI/CD
      - jenkins
    - ▶ 自动化
      - allinssl.md
    - ▶ 日志处理
      - elk
    - ▶ 监控
- 软件速学教程
▶ 软件园
- AI智能体与应用
- 开发工具与环境
- AI 开发和编排
- 业务与生产力应用
- 数据和中间件
▶ 工具箱
- 内容管理
- 编码解码
- ▶ 系统监控
  - miaotixing.md
- ▶ 日常工具
- 工具命令
- 使用教程

📚 第2章：数据结构详解（Series和DataFrame）

2.1 Series详解

2.1.1 Series基础概念

2.1.2 Series创建方法

从列表创建

从字典创建

从NumPy数组创建

2.1.3 Series索引操作

基础索引

高级索引

2.1.4 Series运算和方法

数学运算

字符串操作

2.2 DataFrame详解

2.2.1 DataFrame基础概念

2.2.2 DataFrame创建方法

从字典创建

从列表创建

从NumPy数组创建

2.2.3 DataFrame索引操作

列操作

行操作

loc和iloc详解

2.2.4 DataFrame基本操作

数据查看和描述

数据排序

2.3 索引和标签高级操作

2.3.1 索引设置和重置

2.3.2 多级索引操作

2.4 数据类型和内存管理

2.4.1 数据类型优化

2.4.2 分类数据处理

2.5 本章小结

2.5.1 核心知识点

2.5.2 实践要点

2.5.3 下一步学习

📂 分类导航

📰 最新文章