学习目标

通过本章学习,你将掌握:

- 列表的高级操作和性能优化
- 元组和命名元组的使用
- 字典的高级用法和技巧
- 集合的操作和应用
- 字符串的深入处理
- 数据结构的选择和性能比较

5.1 列表的高级操作

列表的创建和初始化

# 不同的列表创建方式
# 1. 直接创建
fruits = ["apple", "banana", "orange"]
print(f"水果列表: {fruits}")

# 2. 使用list()构造函数
numbers = list(range(1, 6))
print(f"数字列表: {numbers}")

# 3. 列表推导式
squares = [x**2 for x in range(1, 6)]
print(f"平方数列表: {squares}")

# 4. 重复元素
zeros = [0] * 5
print(f"零列表: {zeros}")

# 5. 嵌套列表
matrix = [[0 for _ in range(3)] for _ in range(3)]
print(f"3x3矩阵: {matrix}")

# 注意:避免这样创建嵌套列表
# wrong_matrix = [[0] * 3] * 3  # 所有行都是同一个对象的引用

# 6. 从其他可迭代对象创建
char_list = list("hello")
print(f"字符列表: {char_list}")

# 7. 使用生成器创建
def fibonacci(n):
    """Yield the first n Fibonacci numbers, beginning with 0 and 1."""
    current, upcoming = 0, 1
    for _ in range(n):
        yield current
        # Slide the pair one step along the sequence.
        current, upcoming = upcoming, current + upcoming

fib_list = list(fibonacci(10))
print(f"斐波那契数列: {fib_list}")

列表的高级操作

# 列表的切片操作
numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# 基本切片
print(f"前5个元素: {numbers[:5]}")
print(f"后5个元素: {numbers[-5:]}")
print(f"中间元素: {numbers[2:8]}")
print(f"每隔一个: {numbers[::2]}")
print(f"反转列表: {numbers[::-1]}")

# 切片赋值
numbers[2:5] = [20, 30, 40]
print(f"切片赋值后: {numbers}")

# 插入多个元素
numbers[2:2] = [15, 25]
print(f"插入元素后: {numbers}")

# 删除元素
del numbers[2:4]
print(f"删除元素后: {numbers}")

# 列表的方法
fruits = ["apple", "banana", "orange", "apple"]

# 添加元素
fruits.append("grape")  # 末尾添加
fruits.insert(1, "kiwi")  # 指定位置插入
fruits.extend(["mango", "pear"])  # 扩展列表
print(f"添加后: {fruits}")

# 删除元素
fruits.remove("apple")  # 删除第一个匹配的元素
popped = fruits.pop()  # 删除并返回最后一个元素
popped_index = fruits.pop(1)  # 删除并返回指定索引的元素
print(f"删除后: {fruits}")
print(f"弹出的元素: {popped}, {popped_index}")

# 查找和计数
fruits = ["apple", "banana", "orange", "apple", "grape"]
print(f"apple的索引: {fruits.index('apple')}")
print(f"apple的数量: {fruits.count('apple')}")

# 排序和反转
numbers = [3, 1, 4, 1, 5, 9, 2, 6]
numbers.sort()  # 原地排序
print(f"排序后: {numbers}")

numbers.reverse()  # 原地反转
print(f"反转后: {numbers}")

# 不修改原列表的排序
original = [3, 1, 4, 1, 5, 9, 2, 6]
sorted_list = sorted(original)  # 返回新列表
reversed_list = list(reversed(original))  # 返回新列表
print(f"原列表: {original}")
print(f"排序的新列表: {sorted_list}")
print(f"反转的新列表: {reversed_list}")

# 自定义排序
students = [
    {"name": "Alice", "age": 20, "grade": 85},
    {"name": "Bob", "age": 19, "grade": 92},
    {"name": "Charlie", "age": 21, "grade": 78}
]

# 按年龄排序
students_by_age = sorted(students, key=lambda x: x["age"])
print("按年龄排序:")
for student in students_by_age:
    print(f"  {student['name']}: {student['age']}岁")

# 按成绩降序排序
students_by_grade = sorted(students, key=lambda x: x["grade"], reverse=True)
print("按成绩降序排序:")
for student in students_by_grade:
    print(f"  {student['name']}: {student['grade']}分")

# 多条件排序
students_multi = sorted(students, key=lambda x: (-x["grade"], x["age"]))
print("按成绩降序,年龄升序排序:")
for student in students_multi:
    print(f"  {student['name']}: {student['grade']}分, {student['age']}岁")

列表的性能优化

import time
import sys
from collections import deque

# 列表vs双端队列性能比较
def performance_comparison(n=100000):
    """Time head insertion into a list against a deque and print the results.

    Args:
        n: Number of elements inserted at the head of each container.
            Defaults to 100000.

    Prints the elapsed seconds for each container and, when it can be
    computed, how many times faster the deque was.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() may be coarse and can jump.

    # Head insertion into a list (slow)
    start_time = time.perf_counter()
    lst = []
    for i in range(n):
        lst.insert(0, i)
    list_time = time.perf_counter() - start_time

    # Head insertion into a deque (fast)
    start_time = time.perf_counter()
    dq = deque()
    for i in range(n):
        dq.appendleft(i)
    deque_time = time.perf_counter() - start_time

    print(f"列表头部插入{n}个元素耗时: {list_time:.4f}秒")
    print(f"双端队列头部插入{n}个元素耗时: {deque_time:.4f}秒")
    # Guard the ratio: a measured duration of zero would otherwise raise
    # ZeroDivisionError.
    if deque_time > 0:
        print(f"性能提升: {list_time/deque_time:.1f}倍")
    else:
        print("性能提升: 无法计算(双端队列耗时低于计时精度)")

# performance_comparison()

# 内存使用优化
def memory_optimization(n=1000000):
    """Time three ways of building a list of n integers and print the results.

    Args:
        n: Number of elements each list is filled with. Defaults to 1000000.

    Prints the elapsed seconds for appending one by one, filling a
    preallocated list by index, and a list comprehension, followed by the
    size in bytes reported for the first list.
    """
    # perf_counter is the clock intended for short benchmark intervals.

    # Approach 1: append one element at a time (the list grows repeatedly)
    start_time = time.perf_counter()
    lst1 = []
    for i in range(n):
        lst1.append(i)
    time1 = time.perf_counter() - start_time

    # Approach 2: preallocate the list, then fill it by index
    start_time = time.perf_counter()
    lst2 = [None] * n
    for i in range(n):
        lst2[i] = i
    time2 = time.perf_counter() - start_time

    # Approach 3: list comprehension
    start_time = time.perf_counter()
    lst3 = [i for i in range(n)]
    time3 = time.perf_counter() - start_time

    print(f"逐个添加耗时: {time1:.4f}秒")
    print(f"预分配空间耗时: {time2:.4f}秒")
    print(f"列表推导式耗时: {time3:.4f}秒")

    # sys.getsizeof is shallow: it reports the list object itself, not the
    # integers it refers to.
    print(f"列表内存使用: {sys.getsizeof(lst1)} 字节")

# memory_optimization()

# 列表的深拷贝和浅拷贝
import copy

original = [[1, 2, 3], [4, 5, 6]]

# 浅拷贝
shallow_copy = original.copy()  # 或者 original[:] 或 list(original)
shallow_copy[0][0] = 999
print(f"原列表: {original}")  # 内部列表被修改
print(f"浅拷贝: {shallow_copy}")

# 深拷贝
original = [[1, 2, 3], [4, 5, 6]]
deep_copy = copy.deepcopy(original)
deep_copy[0][0] = 999
print(f"原列表: {original}")  # 内部列表未被修改
print(f"深拷贝: {deep_copy}")

# Memory view of the list's values.
# The view wraps the bytearray built from the list, not the list itself;
# bytearray() only accepts integers in range(256), which holds for these values.
numbers = [1, 2, 3, 4, 5]
view = memoryview(bytearray(numbers))
print(f"内存视图: {list(view)}")

列表的高级应用

# 列表的分组和分块
def chunk_list(lst, chunk_size):
    """Split lst into consecutive slices of at most chunk_size elements.

    The last slice is shorter when len(lst) is not a multiple of chunk_size.
    """
    chunks = []
    for start in range(0, len(lst), chunk_size):
        stop = start + chunk_size
        chunks.append(lst[start:stop])
    return chunks

numbers = list(range(1, 21))
chunks = chunk_list(numbers, 5)
print(f"分块结果: {chunks}")

# 列表的扁平化
def flatten_list(nested_list):
    """Flatten arbitrarily nested lists into a single flat list.

    Args:
        nested_list: Iterable whose items may themselves be lists, nested to
            any depth. Only list instances are descended into; every other
            item is kept as is.

    Returns:
        A new list with all non-list items in their original left-to-right
        order.
    """
    result = []
    # An explicit stack of iterators replaces recursion, so nesting deeper
    # than the interpreter's recursion limit no longer raises RecursionError.
    stack = [iter(nested_list)]
    while stack:
        for item in stack[-1]:
            if isinstance(item, list):
                # Descend; the parent iterator keeps its position and is
                # resumed once this sub-list is exhausted.
                stack.append(iter(item))
                break
            result.append(item)
        else:
            # The innermost iterator ran dry without finding a sub-list.
            stack.pop()
    return result

nested = [1, [2, 3], [4, [5, 6]], 7]
flat = flatten_list(nested)
print(f"扁平化结果: {flat}")

# 使用itertools.chain扁平化
from itertools import chain
nested_simple = [[1, 2], [3, 4], [5, 6]]
flat_chain = list(chain.from_iterable(nested_simple))
print(f"chain扁平化: {flat_chain}")

# 列表的去重(保持顺序)
def remove_duplicates(lst):
    """Return the items of lst with repeats dropped, keeping first occurrences in order."""
    encountered = set()
    unique_items = []
    for candidate in lst:
        if candidate in encountered:
            continue  # already emitted earlier
        encountered.add(candidate)
        unique_items.append(candidate)
    return unique_items

with_duplicates = [1, 2, 3, 2, 4, 1, 5]
unique = remove_duplicates(with_duplicates)
print(f"去重结果: {unique}")

# 使用dict.fromkeys去重(Python 3.7+保持顺序)
unique_dict = list(dict.fromkeys(with_duplicates))
print(f"dict去重结果: {unique_dict}")

# Intersection, union and difference of two lists
list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]

# Build the lookup set once so every membership test below is a hash lookup
# instead of a rescan of list2 for each element of list1.
list2_lookup = set(list2)

# Intersection (keeps list1's order)
intersection = [x for x in list1 if x in list2_lookup]
print(f"交集: {intersection}")

# Union (deduplicated)
union = list(set(list1) | list2_lookup)
print(f"并集: {sorted(union)}")

# Difference (elements of list1 that are not in list2)
difference = [x for x in list1 if x not in list2_lookup]
print(f"差集: {difference}")

# 列表的旋转
def rotate_list(lst, k):
    """Return lst rotated k positions to the right.

    Args:
        lst: Sequence to rotate (anything supporting len, slicing and +).
        k: Number of positions. It is reduced modulo len(lst), so values
            larger than the length wrap around and negative values rotate
            to the left.

    Returns:
        A new sequence holding the rotated items. A list argument is never
        returned as the same object, even when no rotation is needed, so the
        caller can modify the result without touching the input. Empty
        non-list values are returned unchanged.
    """
    if not lst:
        # Nothing to rotate: hand back a fresh list rather than an alias.
        return lst[:] if isinstance(lst, list) else lst
    k %= len(lst)
    # With k == 0 this is lst[0:] + lst[:0], i.e. a plain copy.
    return lst[-k:] + lst[:-k]

numbers = [1, 2, 3, 4, 5]
rotated = rotate_list(numbers, 2)
print(f"旋转2位: {rotated}")

# 列表的滑动窗口
def sliding_window(lst, window_size):
    """Yield every contiguous slice of lst that is window_size items long.

    Args:
        lst: Sequence supporting len and slicing.
        window_size: Length of each window; must be at least 1.

    Yields:
        lst[i:i + window_size] for each start index i, left to right.
        Nothing is yielded when lst is shorter than window_size.

    Raises:
        ValueError: If window_size is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A zero or negative size would otherwise produce empty or truncated
    # slices instead of real windows.
    if window_size < 1:
        raise ValueError("window_size必须是正整数")
    for i in range(len(lst) - window_size + 1):
        yield lst[i:i + window_size]

numbers = [1, 2, 3, 4, 5, 6, 7]
windows = list(sliding_window(numbers, 3))
print(f"滑动窗口(大小3): {windows}")

5.2 元组和命名元组

元组的基础操作

# 元组的创建
# 1. 使用圆括号
point = (3, 4)
print(f"坐标点: {point}")

# 2. 不使用圆括号(但推荐使用)
color = 255, 128, 0
print(f"颜色: {color}")

# 3. 单元素元组(注意逗号)
single = (42,)  # 或者 42,
print(f"单元素元组: {single}")
print(f"类型: {type(single)}")

# 4. 空元组
empty = ()
print(f"空元组: {empty}")

# 5. 使用tuple()构造函数
from_list = tuple([1, 2, 3])
from_string = tuple("hello")
print(f"从列表创建: {from_list}")
print(f"从字符串创建: {from_string}")

# 元组的不可变性
point = (3, 4)
# point[0] = 5  # 这会引发TypeError

# 但是元组中的可变对象可以修改
data = ([1, 2], [3, 4])
data[0].append(3)  # 修改列表内容
print(f"修改后的元组: {data}")

# 元组的操作
numbers = (1, 2, 3, 4, 5)

# 索引和切片
print(f"第一个元素: {numbers[0]}")
print(f"最后一个元素: {numbers[-1]}")
print(f"前三个元素: {numbers[:3]}")

# 长度和成员检查
print(f"元组长度: {len(numbers)}")
print(f"3在元组中: {3 in numbers}")

# 计数和查找
data = (1, 2, 3, 2, 4, 2, 5)
print(f"2的数量: {data.count(2)}")
print(f"2的第一个索引: {data.index(2)}")

# 元组的连接和重复
tuple1 = (1, 2, 3)
tuple2 = (4, 5, 6)
combined = tuple1 + tuple2
repeated = tuple1 * 3
print(f"连接: {combined}")
print(f"重复: {repeated}")

# 元组的解包
point = (3, 4)
x, y = point
print(f"x: {x}, y: {y}")

# 多重赋值
a, b, c = 1, 2, 3
print(f"a: {a}, b: {b}, c: {c}")

# 交换变量
a, b = b, a
print(f"交换后 a: {a}, b: {b}")

# 扩展解包(Python 3+)
numbers = (1, 2, 3, 4, 5)
first, *middle, last = numbers
print(f"第一个: {first}")
print(f"中间: {middle}")
print(f"最后一个: {last}")

# 忽略某些值
data = ("Alice", 25, "Engineer", "Beijing")
name, age, *_ = data
print(f"姓名: {name}, 年龄: {age}")

命名元组

from collections import namedtuple
from typing import NamedTuple

# 使用namedtuple创建
Point = namedtuple('Point', ['x', 'y'])
point = Point(3, 4)
print(f"点坐标: {point}")
print(f"x坐标: {point.x}")
print(f"y坐标: {point.y}")

# 使用字符串定义字段
Person = namedtuple('Person', 'name age city')
person = Person('Alice', 25, 'Beijing')
print(f"个人信息: {person}")

# 命名元组的方法
print(f"字段名: {person._fields}")
print(f"转为字典: {person._asdict()}")

# 替换字段值(返回新对象)
new_person = person._replace(age=26)
print(f"更新年龄后: {new_person}")

# 从可迭代对象创建
data = ['Bob', 30, 'Shanghai']
person2 = Person._make(data)
print(f"从列表创建: {person2}")

# 设置默认值
Student = namedtuple('Student', ['name', 'age', 'grade'], defaults=[0])
student1 = Student('Charlie', 20)  # grade使用默认值
student2 = Student('David', 21, 85)
print(f"学生1: {student1}")
print(f"学生2: {student2}")

# 使用typing.NamedTuple(推荐,支持类型注解)
class Employee(NamedTuple):
    """Immutable employee record with typed fields."""

    name: str
    age: int
    department: str
    salary: float = 0.0  # optional; get_annual_salary treats it as a per-month amount

    def get_annual_salary(self) -> float:
        """Return twelve times the salary field."""
        months_per_year = 12
        return self.salary * months_per_year

    def __str__(self) -> str:
        """Return a short label with name, age and department."""
        return f"{self.name} ({self.age}岁) - {self.department}部门"

emp = Employee('Alice', 28, 'IT', 8000)
print(f"员工信息: {emp}")
print(f"年薪: {emp.get_annual_salary()}")

# 命名元组的应用场景

# 1. 函数返回多个值
# Result record for get_circle_info. Defined once at module level so every
# call returns the same type instead of building a new class per call.
CircleInfo = namedtuple('CircleInfo', ['radius', 'area', 'circumference'])


def get_circle_info(radius):
    """Return the radius, area and circumference of a circle.

    Args:
        radius: Radius of the circle.

    Returns:
        A CircleInfo named tuple with fields radius, area (pi * radius ** 2)
        and circumference (2 * pi * radius).
    """
    import math
    area = math.pi * radius ** 2
    circumference = 2 * math.pi * radius
    return CircleInfo(radius, area, circumference)

circle = get_circle_info(5)
print(f"圆的信息: 半径={circle.radius}, 面积={circle.area:.2f}, 周长={circle.circumference:.2f}")

# 2. 配置对象
Config = namedtuple('Config', ['host', 'port', 'debug', 'timeout'])
config = Config('localhost', 8080, True, 30)
print(f"配置: {config}")

# 3. 数据记录
LogEntry = namedtuple('LogEntry', ['timestamp', 'level', 'message'])
from datetime import datetime

log = LogEntry(datetime.now(), 'INFO', '应用启动成功')
print(f"日志: [{log.timestamp}] {log.level}: {log.message}")

# 4. 坐标和向量运算
class Vector(NamedTuple):
    """Immutable 2D vector supporting addition and scalar multiplication."""

    x: float
    y: float

    def __add__(self, other):
        """Return the component-wise sum of this vector and other."""
        return Vector(self.x + other.x, self.y + other.y)

    def __mul__(self, scalar):
        """Return this vector with both components multiplied by scalar."""
        return Vector(self.x * scalar, self.y * scalar)

    def __rmul__(self, scalar):
        """Support scalar * vector.

        Without this, the reflected operand falls back to tuple repetition
        and 2 * Vector(3, 4) yields the plain tuple (3, 4, 3, 4).
        """
        return self * scalar

    def magnitude(self):
        """Return the Euclidean length of the vector."""
        return (self.x ** 2 + self.y ** 2) ** 0.5

    def __str__(self):
        """Return the text form "Vector(x, y)"."""
        return f"Vector({self.x}, {self.y})"

v1 = Vector(3, 4)
v2 = Vector(1, 2)
v3 = v1 + v2
v4 = v1 * 2

print(f"向量1: {v1}")
print(f"向量2: {v2}")
print(f"向量相加: {v3}")
print(f"向量乘标量: {v4}")
print(f"向量1的模: {v1.magnitude()}")

5.3 字典的高级用法

字典的创建和基本操作

# 字典的创建方式
# 1. 字面量语法
student = {'name': 'Alice', 'age': 20, 'grade': 85}
print(f"学生信息: {student}")

# 2. dict()构造函数
student2 = dict(name='Bob', age=21, grade=90)
print(f"学生2: {student2}")

# 3. 从键值对列表创建
pairs = [('name', 'Charlie'), ('age', 19), ('grade', 88)]
student3 = dict(pairs)
print(f"学生3: {student3}")

# 4. 从两个列表创建
keys = ['name', 'age', 'grade']
values = ['David', 22, 92]
student4 = dict(zip(keys, values))
print(f"学生4: {student4}")

# 5. 字典推导式
squares = {x: x**2 for x in range(1, 6)}
print(f"平方数字典: {squares}")

# 6. 使用dict.fromkeys()
default_scores = dict.fromkeys(['math', 'english', 'science'], 0)
print(f"默认分数: {default_scores}")

# 字典的基本操作
student = {'name': 'Alice', 'age': 20, 'grade': 85}

# 访问元素
print(f"姓名: {student['name']}")
print(f"年龄: {student.get('age', '未知')}")
print(f"城市: {student.get('city', '未知')}")

# 修改和添加
student['age'] = 21
student['city'] = 'Beijing'
print(f"修改后: {student}")

# 删除元素
del student['grade']
popped_value = student.pop('city', '默认值')
print(f"删除后: {student}")
print(f"弹出的值: {popped_value}")

# 清空字典
temp_dict = {'a': 1, 'b': 2}
temp_dict.clear()
print(f"清空后: {temp_dict}")

# 字典的视图对象
data = {'a': 1, 'b': 2, 'c': 3}
keys_view = data.keys()
values_view = data.values()
items_view = data.items()

print(f"键视图: {list(keys_view)}")
print(f"值视图: {list(values_view)}")
print(f"项视图: {list(items_view)}")

# 视图是动态的
data['d'] = 4
print(f"添加元素后的键视图: {list(keys_view)}")

字典的高级操作

# 字典的合并
dict1 = {'a': 1, 'b': 2}
dict2 = {'c': 3, 'd': 4}
dict3 = {'b': 20, 'e': 5}

# 方法1: update()
merged1 = dict1.copy()
merged1.update(dict2)
print(f"update合并: {merged1}")

# 方法2: ** 解包(Python 3.5+)
merged2 = {**dict1, **dict2, **dict3}
print(f"解包合并: {merged2}")

# 方法3: | 操作符(Python 3.9+)
# merged3 = dict1 | dict2 | dict3
# print(f"操作符合并: {merged3}")

# 字典的setdefault方法
counter = {}
words = ['apple', 'banana', 'apple', 'cherry', 'banana', 'apple']

for word in words:
    counter.setdefault(word, 0)
    counter[word] += 1

print(f"词频统计: {counter}")

# 使用defaultdict
from collections import defaultdict

# 自动创建默认值
counter2 = defaultdict(int)
for word in words:
    counter2[word] += 1

print(f"defaultdict词频: {dict(counter2)}")

# 嵌套字典的defaultdict
nested_dict = defaultdict(lambda: defaultdict(int))
nested_dict['fruits']['apple'] = 5
nested_dict['fruits']['banana'] = 3
nested_dict['vegetables']['carrot'] = 2

print(f"嵌套字典: {dict(nested_dict)}")

# 字典的排序
students = {
    'Alice': 85,
    'Bob': 92,
    'Charlie': 78,
    'David': 96
}

# 按键排序
sorted_by_key = dict(sorted(students.items()))
print(f"按姓名排序: {sorted_by_key}")

# 按值排序
sorted_by_value = dict(sorted(students.items(), key=lambda x: x[1], reverse=True))
print(f"按成绩排序: {sorted_by_value}")

# 获取最高分和最低分
best_student = max(students.items(), key=lambda x: x[1])
worst_student = min(students.items(), key=lambda x: x[1])
print(f"最高分: {best_student}")
print(f"最低分: {worst_student}")

# 字典的过滤
high_scores = {name: score for name, score in students.items() if score >= 90}
print(f"高分学生: {high_scores}")

# 字典的反转
reversed_dict = {v: k for k, v in students.items()}
print(f"反转字典: {reversed_dict}")

# 多级字典操作
company = {
    'IT': {
        'Alice': {'salary': 8000, 'level': 'senior'},
        'Bob': {'salary': 6000, 'level': 'junior'}
    },
    'HR': {
        'Charlie': {'salary': 7000, 'level': 'senior'},
        'David': {'salary': 5000, 'level': 'junior'}
    }
}

# 安全访问嵌套字典
def safe_get(dictionary, *keys, default=None):
    """Follow keys through nested dicts, returning default if the path breaks.

    The path breaks when an intermediate value is not a dict or a key is
    missing. With no keys, the dictionary itself is returned.
    """
    current = dictionary
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current

alice_salary = safe_get(company, 'IT', 'Alice', 'salary')
nonexistent = safe_get(company, 'Finance', 'Eve', 'salary', default=0)
print(f"Alice的薪水: {alice_salary}")
print(f"不存在的值: {nonexistent}")

# 扁平化嵌套字典
def flatten_dict(d, parent_key='', sep='.'):
    """Collapse nested dicts into one level with sep-joined key paths.

    parent_key, when non-empty, is prefixed to every key of d. Values that
    are dicts are flattened recursively; all other values are kept as is.
    """
    flattened = {}
    for key, value in d.items():
        full_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            flattened.update(flatten_dict(value, full_key, sep=sep))
        else:
            flattened[full_key] = value
    return flattened

flat_company = flatten_dict(company)
print("扁平化公司数据:")
for key, value in flat_company.items():
    print(f"  {key}: {value}")

字典的特殊用法

# 使用字典实现switch语句
def handle_operation(operation, x, y):
    """Apply the arithmetic operation named by operation to x and y.

    Known names are 'add', 'subtract', 'multiply' and 'divide'. Division by
    zero gives None; any other name gives the "unknown operation" message.
    """
    def divide(a, b):
        return a / b if b != 0 else None

    dispatch = {
        'add': lambda a, b: a + b,
        'subtract': lambda a, b: a - b,
        'multiply': lambda a, b: a * b,
        'divide': divide,
    }

    if operation not in dispatch:
        return "未知操作"
    return dispatch[operation](x, y)

print(f"加法: {handle_operation('add', 5, 3)}")
print(f"除法: {handle_operation('divide', 10, 2)}")
print(f"未知: {handle_operation('unknown', 1, 2)}")

# 字典作为缓存
class FibonacciCache:
    """Compute Fibonacci numbers, remembering every value already produced."""

    def __init__(self):
        # Seeded with the two base cases F(0) = 0 and F(1) = 1.
        self.cache = {0: 0, 1: 1}

    def fibonacci(self, n):
        """Return the n-th Fibonacci number.

        Args:
            n: Non-negative integer index into the sequence.

        Returns:
            F(n), with every value up to n stored in the cache.

        Raises:
            ValueError: If n is negative.
        """
        if n in self.cache:
            return self.cache[n]
        if n < 0:
            # A negative index would never reach a base case.
            raise ValueError("n必须是非负整数")

        # Fill the cache bottom-up rather than recursing, so large n cannot
        # exhaust the interpreter's recursion limit.
        cache = self.cache
        for i in range(2, n + 1):
            if i not in cache:
                cache[i] = cache[i - 1] + cache[i - 2]
        return cache[n]

    def get_cache_size(self):
        """Return the number of cached values."""
        return len(self.cache)

fib_calc = FibonacciCache()
print(f"斐波那契数列第20项: {fib_calc.fibonacci(20)}")
print(f"缓存大小: {fib_calc.get_cache_size()}")

# 使用functools.lru_cache装饰器
from functools import lru_cache

@lru_cache(maxsize=128)
def fibonacci_cached(n):
    """Return the n-th Fibonacci number; results are memoized by lru_cache."""
    # Values below 2 are their own Fibonacci number.
    return n if n < 2 else fibonacci_cached(n - 1) + fibonacci_cached(n - 2)

print(f"缓存版斐波那契第30项: {fibonacci_cached(30)}")
print(f"缓存信息: {fibonacci_cached.cache_info()}")

# 字典的弱引用
import weakref

class DataStore:
    """Key/object store that holds its values through weak references."""

    def __init__(self):
        # Values are held weakly by the WeakValueDictionary.
        self._data = weakref.WeakValueDictionary()

    def store(self, key, obj):
        """Register obj under key."""
        self._data[key] = obj

    def get(self, key):
        """Return the object stored under key, or None if there is none."""
        try:
            return self._data[key]
        except KeyError:
            return None

    def keys(self):
        """Return a list of the keys currently present."""
        return list(self._data)

class TempObject:
    """Small value holder that prints a message when it is finalized."""

    def __init__(self, value):
        self.value = value
    
    def __del__(self):
        # Finalizer hook: reports which object is going away.
        print(f"对象 {self.value} 被删除")

store = DataStore()
obj1 = TempObject("test1")
obj2 = TempObject("test2")

store.store("key1", obj1)
store.store("key2", obj2)

print(f"存储的键: {store.keys()}")

# 删除强引用
del obj1
print(f"删除obj1后的键: {store.keys()}")

# 字典的内存优化(__slots__)
class RegularClass:
    """Plain class with two attributes, used as the baseline for the size comparison."""

    def __init__(self, x, y):
        self.x = x
        self.y = y

class SlottedClass:
    """Same two attributes as RegularClass, but declared through __slots__."""

    # Fixed attribute names for instances of this class.
    __slots__ = ['x', 'y']
    
    def __init__(self, x, y):
        self.x = x
        self.y = y

import sys

regular = RegularClass(1, 2)
slotted = SlottedClass(1, 2)

print(f"普通类实例大小: {sys.getsizeof(regular)} + {sys.getsizeof(regular.__dict__)} = {sys.getsizeof(regular) + sys.getsizeof(regular.__dict__)}")
print(f"__slots__类实例大小: {sys.getsizeof(slotted)}")

5.4 集合的操作和应用

集合的基础操作

# 集合的创建
# 1. 使用花括号
fruits = {'apple', 'banana', 'orange'}
print(f"水果集合: {fruits}")

# 2. 使用set()构造函数
numbers = set([1, 2, 3, 4, 5])
print(f"数字集合: {numbers}")

# 3. 从字符串创建
chars = set('hello')
print(f"字符集合: {chars}")

# 4. Empty set (note: {} cannot be used, it creates an empty dict)
empty_set = set()
print(f"空集合: {empty_set}")
# An f-string replacement field must contain an expression, so the bare {}
# literal is bound to a name first instead of being written inline.
empty_dict = {}
print(f"空字典: {empty_dict}")

# 集合的特性:无序、唯一
data = [1, 2, 2, 3, 3, 3, 4, 5]
unique_data = set(data)
print(f"去重后: {unique_data}")

# 集合的基本操作
fruits = {'apple', 'banana', 'orange'}

# 添加元素
fruits.add('grape')
print(f"添加grape后: {fruits}")

# 添加多个元素
fruits.update(['kiwi', 'mango'])
print(f"添加多个元素后: {fruits}")

# 删除元素
fruits.remove('banana')  # 如果元素不存在会抛出KeyError
fruits.discard('pear')   # 如果元素不存在不会抛出异常
popped = fruits.pop()    # 随机删除一个元素
print(f"删除操作后: {fruits}")
print(f"弹出的元素: {popped}")

# 清空集合
temp_set = {'a', 'b', 'c'}
temp_set.clear()
print(f"清空后: {temp_set}")

# 成员检查
fruits = {'apple', 'banana', 'orange'}
print(f"apple在集合中: {'apple' in fruits}")
print(f"grape在集合中: {'grape' in fruits}")

# 集合长度
print(f"集合大小: {len(fruits)}")

集合运算

# 集合运算
set1 = {1, 2, 3, 4, 5}
set2 = {4, 5, 6, 7, 8}
set3 = {1, 2, 3}

# 并集(union)
union1 = set1 | set2
union2 = set1.union(set2)
print(f"并集: {union1}")
print(f"并集(方法): {union2}")

# 交集(intersection)
intersection1 = set1 & set2
intersection2 = set1.intersection(set2)
print(f"交集: {intersection1}")
print(f"交集(方法): {intersection2}")

# 差集(difference)
difference1 = set1 - set2
difference2 = set1.difference(set2)
print(f"差集(set1-set2): {difference1}")
print(f"差集(方法): {difference2}")

# 对称差集(symmetric_difference)
sym_diff1 = set1 ^ set2
sym_diff2 = set1.symmetric_difference(set2)
print(f"对称差集: {sym_diff1}")
print(f"对称差集(方法): {sym_diff2}")

# 子集和超集检查
print(f"set3是set1的子集: {set3.issubset(set1)}")
print(f"set1是set3的超集: {set1.issuperset(set3)}")
print(f"set1和set2不相交: {set1.isdisjoint(set2)}")

# 就地运算(修改原集合)
original = {1, 2, 3}
print(f"原始集合: {original}")

original |= {4, 5}  # 就地并集
print(f"就地并集后: {original}")

original &= {1, 2, 3, 4}  # 就地交集
print(f"就地交集后: {original}")

original -= {1}  # 就地差集
print(f"就地差集后: {original}")

original ^= {2, 5, 6}  # 就地对称差集
print(f"就地对称差集后: {original}")

# 多个集合运算
sets = [{1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {4, 5, 6}]

# 所有集合的并集
all_union = set().union(*sets)
print(f"所有集合的并集: {all_union}")

# 所有集合的交集
all_intersection = set.intersection(*sets)
print(f"所有集合的交集: {all_intersection}")

# 使用reduce进行复杂运算
from functools import reduce

# 逐步并集
step_union = reduce(lambda x, y: x | y, sets)
print(f"逐步并集: {step_union}")

# 逐步交集
step_intersection = reduce(lambda x, y: x & y, sets)
print(f"逐步交集: {step_intersection}")

集合的高级应用

# 数据去重和清洗
def clean_data(data_list):
    """Normalize raw values into a sorted list of unique, stripped strings.

    None and empty strings are skipped; every other item is converted with
    str() and stripped of surrounding whitespace. Items that end up empty
    after stripping are dropped as well.
    """
    normalized = set()
    for item in data_list:
        if item is not None and item != '':
            text = str(item).strip()
            if text:
                normalized.add(text)
    return sorted(normalized)

raw_data = [1, 2, '2', 3, None, '', '  3  ', 4, 4, '1']
cleaned = clean_data(raw_data)
print(f"原始数据: {raw_data}")
print(f"清洗后: {cleaned}")

# 权限管理系统
class PermissionManager:
    """Track a set of permissions per user, optionally seeded from named roles."""

    def __init__(self):
        # user -> set of permissions currently held
        self.user_permissions = {}
        # role name -> permissions granted by that role
        self.role_permissions = {
            'admin': {'read', 'write', 'delete', 'execute'},
            'editor': {'read', 'write'},
            'viewer': {'read'}
        }

    def assign_role(self, user, role):
        """Add every permission of role to user; unknown roles are ignored."""
        if role not in self.role_permissions:
            return
        held = self.user_permissions.setdefault(user, set())
        held.update(self.role_permissions[role])

    def grant_permission(self, user, permission):
        """Give user a single permission, creating the user entry if needed."""
        self.user_permissions.setdefault(user, set()).add(permission)

    def revoke_permission(self, user, permission):
        """Take a permission away from user; no effect if user or permission is absent."""
        if user not in self.user_permissions:
            return
        self.user_permissions[user].discard(permission)

    def has_permission(self, user, permission):
        """Return True if user currently holds permission."""
        held = self.user_permissions.get(user, set())
        return permission in held

    def get_user_permissions(self, user):
        """Return a copy of user's permission set (empty for unknown users)."""
        return set(self.user_permissions.get(user, set()))

    def get_common_permissions(self, *users):
        """Return the permissions held by every one of the given users."""
        if not users:
            return set()

        permission_sets = [self.user_permissions.get(user, set()) for user in users]
        # intersection() always builds a new set, so stored sets stay untouched.
        return permission_sets[0].intersection(*permission_sets[1:])

    def get_unique_permissions(self, user, *other_users):
        """Return the permissions user holds that none of other_users hold."""
        own = self.user_permissions.get(user, set())
        held_by_others = set().union(
            *(self.user_permissions.get(other, set()) for other in other_users)
        )
        return own - held_by_others

# 使用权限管理系统
pm = PermissionManager()

# 分配角色
pm.assign_role('alice', 'admin')
pm.assign_role('bob', 'editor')
pm.assign_role('charlie', 'viewer')

# 授予额外权限
pm.grant_permission('bob', 'execute')
pm.grant_permission('charlie', 'write')

print(f"Alice的权限: {pm.get_user_permissions('alice')}")
print(f"Bob的权限: {pm.get_user_permissions('bob')}")
print(f"Charlie的权限: {pm.get_user_permissions('charlie')}")

# 检查权限
print(f"Bob有写权限: {pm.has_permission('bob', 'write')}")
print(f"Charlie有删除权限: {pm.has_permission('charlie', 'delete')}")

# 共同权限和独有权限
common = pm.get_common_permissions('alice', 'bob', 'charlie')
unique_alice = pm.get_unique_permissions('alice', 'bob', 'charlie')

print(f"三人共同权限: {common}")
print(f"Alice独有权限: {unique_alice}")

# 标签系统
class TagSystem:
    """Two-way index between items and the tags attached to them."""

    def __init__(self):
        self.item_tags = {}  # item -> set of tags
        self.tag_items = {}  # tag -> set of items

    def add_tags(self, item, *tags):
        """Attach tags to item, registering the item even when no tags are given."""
        own_tags = self.item_tags.setdefault(item, set())
        for tag in tags:
            own_tags.add(tag)
            self.tag_items.setdefault(tag, set()).add(item)

    def remove_tags(self, item, *tags):
        """Detach tags from item; tags left without any item are dropped from the index."""
        if item not in self.item_tags:
            return
        own_tags = self.item_tags[item]
        for tag in tags:
            own_tags.discard(tag)
            if tag not in self.tag_items:
                continue
            holders = self.tag_items[tag]
            holders.discard(item)
            if not holders:
                del self.tag_items[tag]

    def get_items_by_tags(self, *tags, mode='any'):
        """Return the items carrying any (mode='any') or all (mode='all') of tags.

        An empty tag list gives an empty set. Any other mode raises ValueError.
        """
        if not tags:
            return set()
        if mode not in ('any', 'all'):
            raise ValueError("mode必须是'any'或'all'")

        matches = [self.tag_items.get(tag, set()) for tag in tags]
        if mode == 'any':
            # Items carrying at least one of the tags
            return set().union(*matches)
        # Items carrying every tag; intersection() returns a new set
        return matches[0].intersection(*matches[1:])

    def get_related_items(self, item):
        """Return the other items that share at least one tag with item."""
        if item not in self.item_tags:
            return set()

        related = set().union(
            *(self.tag_items.get(tag, set()) for tag in self.item_tags[item])
        )
        related.discard(item)  # an item is not related to itself
        return related

    def get_tag_similarity(self, item1, item2):
        """Return shared tags divided by combined tags for the two items (0.0 if neither has tags)."""
        tags1 = self.item_tags.get(item1, set())
        tags2 = self.item_tags.get(item2, set())

        combined = tags1 | tags2
        if not combined:
            return 0.0
        return len(tags1 & tags2) / len(combined)

# 使用标签系统
tag_sys = TagSystem()

# 添加物品和标签
tag_sys.add_tags('Python教程', 'programming', 'python', 'tutorial', 'beginner')
tag_sys.add_tags('Java教程', 'programming', 'java', 'tutorial', 'beginner')
tag_sys.add_tags('机器学习', 'programming', 'python', 'ai', 'advanced')
tag_sys.add_tags('数据结构', 'programming', 'algorithm', 'computer-science')
tag_sys.add_tags('Web开发', 'programming', 'web', 'javascript', 'html')

# 查询
python_items = tag_sys.get_items_by_tags('python')
print(f"Python相关物品: {python_items}")

beginner_programming = tag_sys.get_items_by_tags('programming', 'beginner', mode='all')
print(f"编程入门物品: {beginner_programming}")

# 相关物品推荐
related_to_python = tag_sys.get_related_items('Python教程')
print(f"Python教程相关物品: {related_to_python}")

# 相似度计算
similarity = tag_sys.get_tag_similarity('Python教程', 'Java教程')
print(f"Python教程和Java教程的相似度: {similarity:.2f}")

冻结集合(frozenset)

# frozenset的创建和使用
regular_set = {1, 2, 3, 4, 5}
frozen_set = frozenset([1, 2, 3, 4, 5])

print(f"普通集合: {regular_set}")
print(f"冻结集合: {frozen_set}")

# frozenset是不可变的
# frozen_set.add(6)  # 这会引发AttributeError

# frozenset可以作为字典的键
set_dict = {
    frozenset([1, 2]): 'group1',
    frozenset([3, 4]): 'group2',
    frozenset([5, 6]): 'group3'
}

print(f"以frozenset为键的字典: {set_dict}")

# frozenset可以作为集合的元素
set_of_sets = {
    frozenset([1, 2, 3]),
    frozenset([4, 5, 6]),
    frozenset([7, 8, 9])
}

print(f"集合的集合: {set_of_sets}")

# 图的表示(使用frozenset表示边)
class Graph:
    """Undirected graph whose edges are stored as frozensets of their endpoints."""

    def __init__(self):
        self.edges = set()
        self.vertices = set()

    def add_edge(self, vertex1, vertex2):
        """Add an undirected edge between the two vertices and record both vertices."""
        self.edges.add(frozenset((vertex1, vertex2)))
        self.vertices.update((vertex1, vertex2))

    def has_edge(self, vertex1, vertex2):
        """Return True if an edge joins the two vertices, in either order."""
        return frozenset((vertex1, vertex2)) in self.edges

    def get_neighbors(self, vertex):
        """Return the set of vertices that share an edge with vertex."""
        # For every edge touching vertex, keep the endpoint(s) other than vertex.
        return set().union(
            *(edge - {vertex} for edge in self.edges if vertex in edge)
        )

    def __str__(self):
        """Return a text form listing the vertex and edge sets."""
        return f"Graph(vertices={self.vertices}, edges={self.edges})"

# 使用图
graph = Graph()
graph.add_edge('A', 'B')
graph.add_edge('B', 'C')
graph.add_edge('C', 'A')
graph.add_edge('A', 'D')

print(graph)
print(f"A的邻居: {graph.get_neighbors('A')}")
print(f"A和C之间有边: {graph.has_edge('A', 'C')}")
print(f"A和E之间有边: {graph.has_edge('A', 'E')}")

5.5 字符串的深入处理

字符串的创建和基本操作

# 字符串的创建方式
# 1. 单引号和双引号
single_quote = 'Hello, World!'
double_quote = "Hello, World!"
print(f"单引号: {single_quote}")
print(f"双引号: {double_quote}")

# 2. 三引号(多行字符串)
multi_line = """
这是一个
多行字符串
可以包含换行符
"""
print(f"多行字符串: {multi_line}")

# 3. 原始字符串(r前缀)
raw_string = r"C:\Users\name\Documents\file.txt"
print(f"原始字符串: {raw_string}")

# 4. 格式化字符串(f前缀)
name = "Alice"
age = 25
f_string = f"我的名字是{name},今年{age}岁"
print(f"f字符串: {f_string}")

# 5. 字节字符串(b前缀)
byte_string = b"Hello, World!"
print(f"字节字符串: {byte_string}")
print(f"类型: {type(byte_string)}")

# 字符串的不可变性
original = "hello"
# original[0] = 'H'  # 这会引发TypeError
modified = 'H' + original[1:]  # 创建新字符串
print(f"原字符串: {original}")
print(f"修改后: {modified}")

# 字符串的基本操作
text = "Hello, World!"

# 长度
print(f"字符串长度: {len(text)}")

# 索引和切片
print(f"第一个字符: {text[0]}")
print(f"最后一个字符: {text[-1]}")
print(f"前5个字符: {text[:5]}")
print(f"后6个字符: {text[-6:]}")
print(f"每隔一个字符: {text[::2]}")
print(f"反转字符串: {text[::-1]}")

# 成员检查
print(f"'World'在字符串中: {'World' in text}")
print(f"'Python'在字符串中: {'Python' in text}")

# 字符串连接
first_name = "John"
last_name = "Doe"
full_name = first_name + " " + last_name
print(f"全名: {full_name}")

# 字符串重复
repeated = "Ha" * 5
print(f"重复字符串: {repeated}")

字符串方法详解

# 大小写转换
text = "Hello, World!"
print(f"小写: {text.lower()}")
print(f"大写: {text.upper()}")
print(f"首字母大写: {text.capitalize()}")
print(f"标题格式: {text.title()}")
print(f"大小写互换: {text.swapcase()}")

# 判断字符串类型
print(f"是否全为字母: {'Hello'.isalpha()}")
print(f"是否全为数字: {'12345'.isdigit()}")
print(f"是否全为字母数字: {'Hello123'.isalnum()}")
print(f"是否全为小写: {'hello'.islower()}")
print(f"是否全为大写: {'HELLO'.isupper()}")
print(f"是否为标题格式: {'Hello World'.istitle()}")
print(f"是否全为空白字符: {'   '.isspace()}")

# 字符串查找和替换
text = "Python is great. Python is powerful."

# 查找
print(f"'Python'第一次出现的位置: {text.find('Python')}")
print(f"'Python'最后一次出现的位置: {text.rfind('Python')}")
print(f"'Python'出现的次数: {text.count('Python')}")

# 检查开头和结尾
print(f"以'Python'开头: {text.startswith('Python')}")
print(f"以'powerful.'结尾: {text.endswith('powerful.')}")

# 替换
replaced = text.replace('Python', 'Java')
print(f"替换后: {replaced}")

# 限制替换次数
replaced_once = text.replace('Python', 'Java', 1)
print(f"只替换一次: {replaced_once}")

# 字符串分割和连接
sentence = "apple,banana,orange,grape"

# 分割
fruits = sentence.split(',')
print(f"分割结果: {fruits}")

# 按行分割
multi_line_text = "line1\nline2\nline3"
lines = multi_line_text.splitlines()
print(f"按行分割: {lines}")

# 限制分割次数
limited_split = sentence.split(',', 2)
print(f"限制分割次数: {limited_split}")

# 从右边分割
path = "/home/user/documents/file.txt"
dir_file = path.rsplit('/', 1)
print(f"目录和文件: {dir_file}")

# 连接
fruits_list = ['apple', 'banana', 'orange']
joined = ', '.join(fruits_list)
print(f"连接结果: {joined}")

# 连接数字列表(需要转换为字符串)
numbers = [1, 2, 3, 4, 5]
number_string = '-'.join(map(str, numbers))
print(f"数字连接: {number_string}")

# 字符串清理
text_with_spaces = "  Hello, World!  "
print(f"原文本: '{text_with_spaces}'")
print(f"去除两端空白: '{text_with_spaces.strip()}'")
print(f"去除左端空白: '{text_with_spaces.lstrip()}'")
print(f"去除右端空白: '{text_with_spaces.rstrip()}'")

# 去除指定字符
text_with_chars = "...Hello, World!..."
print(f"去除点号: '{text_with_chars.strip('.')}'")

# 字符串填充和对齐
text = "Python"
print(f"左对齐(20): '{text.ljust(20, '-')}'")
print(f"右对齐(20): '{text.rjust(20, '-')}'")
print(f"居中对齐(20): '{text.center(20, '-')}'")
print(f"零填充(10): '{text.zfill(10)}'")

# 数字的零填充
number = "42"
print(f"数字零填充: '{number.zfill(5)}'")

# 字符串编码和解码
text = "你好,世界!"

# 编码为字节
utf8_bytes = text.encode('utf-8')
gbk_bytes = text.encode('gbk')
print(f"UTF-8编码: {utf8_bytes}")
print(f"GBK编码: {gbk_bytes}")

# 解码为字符串
decoded_utf8 = utf8_bytes.decode('utf-8')
decoded_gbk = gbk_bytes.decode('gbk')
print(f"UTF-8解码: {decoded_utf8}")
print(f"GBK解码: {decoded_gbk}")

# 处理编码错误
malformed_bytes = b'\xff\xfe\x00\x00'
try:
    decoded = malformed_bytes.decode('utf-8')
except UnicodeDecodeError as e:
    print(f"解码错误: {e}")
    # 忽略错误
    decoded = malformed_bytes.decode('utf-8', errors='ignore')
    print(f"忽略错误后: '{decoded}'")
    # 替换错误
    decoded = malformed_bytes.decode('utf-8', errors='replace')
    print(f"替换错误后: '{decoded}'")

字符串格式化

# 1. % 格式化(旧式)
name = "Alice"
age = 25
score = 95.5

old_format = "姓名: %s, 年龄: %d, 分数: %.2f" % (name, age, score)
print(f"旧式格式化: {old_format}")

# 2. str.format() 方法
format_method = "姓名: {}, 年龄: {}, 分数: {:.2f}".format(name, age, score)
print(f"format方法: {format_method}")

# 位置参数
format_positional = "姓名: {0}, 年龄: {1}, 分数: {2:.2f}".format(name, age, score)
print(f"位置参数: {format_positional}")

# 关键字参数
format_keyword = "姓名: {name}, 年龄: {age}, 分数: {score:.2f}".format(
    name=name, age=age, score=score
)
print(f"关键字参数: {format_keyword}")

# 3. f-string(推荐,Python 3.6+)
f_string = f"姓名: {name}, 年龄: {age}, 分数: {score:.2f}"
print(f"f-string: {f_string}")

# f-string中的表达式
import math
radius = 5
f_expression = f"半径为{radius}的圆的面积是{math.pi * radius**2:.2f}"
print(f"f-string表达式: {f_expression}")

# f-string中的函数调用
def get_greeting(name):
    """Return the greeting ``Hello, <name>!`` for the given name."""
    greeting = "Hello, {}!".format(name)
    return greeting

f_function = f"问候语: {get_greeting('World')}"
print(f"f-string函数: {f_function}")

# 格式化规范
number = 1234.5678

print(f"默认: {number}")
print(f"保留2位小数: {number:.2f}")
print(f"科学计数法: {number:.2e}")
print(f"百分比: {number:.2%}")
print(f"千分位分隔符: {number:,.2f}")
print(f"右对齐(15位): '{number:>15.2f}'")
print(f"左对齐(15位): '{number:<15.2f}'")
print(f"居中对齐(15位): '{number:^15.2f}'")
print(f"零填充(10位): '{number:010.2f}'")

# 进制转换
num = 255
print(f"十进制: {num:d}")
print(f"二进制: {num:b}")
print(f"八进制: {num:o}")
print(f"十六进制: {num:x}")
print(f"十六进制(大写): {num:X}")

# 日期时间格式化
from datetime import datetime

now = datetime.now()
print(f"当前时间: {now}")
print(f"格式化时间: {now:%Y-%m-%d %H:%M:%S}")
print(f"简短日期: {now:%Y/%m/%d}")
print(f"时间: {now:%H:%M}")

# 字符串模板
from string import Template

template = Template("Hello, $name! You have $count new messages.")
result = template.substitute(name="Alice", count=5)
print(f"模板结果: {result}")

# 安全替换(缺少变量时不报错)
partial_result = template.safe_substitute(name="Bob")
print(f"部分替换: {partial_result}")

正则表达式

import re

# 基本匹配
text = "Hello, my phone number is 123-456-7890"
pattern = r"\d{3}-\d{3}-\d{4}"
match = re.search(pattern, text)
if match:
    print(f"找到电话号码: {match.group()}")

# 查找所有匹配
text = "Contact us at 123-456-7890 or 987-654-3210"
phones = re.findall(pattern, text)
print(f"所有电话号码: {phones}")

# 分组匹配
email_pattern = r"([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
email_text = "Contact: alice@example.com or bob@test.org"
email_matches = re.findall(email_pattern, email_text)
print(f"邮箱匹配: {email_matches}")

# 命名分组
named_pattern = r"(?P<user>[a-zA-Z0-9._%+-]+)@(?P<domain>[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
match = re.search(named_pattern, "alice@example.com")
if match:
    print(f"用户名: {match.group('user')}")
    print(f"域名: {match.group('domain')}")
    print(f"完整匹配: {match.groupdict()}")

# 替换
text = "The price is $100 and the tax is $15"
# 将美元符号替换为人民币符号
replaced = re.sub(r"\$", "¥", text)
print(f"替换后: {replaced}")

# 使用函数进行替换
def convert_currency(match, rate=6.5):
    """Convert a matched dollar amount into a yuan string.

    Usable directly as the replacement callback of ``re.sub``, which calls
    it with the match object only, so ``rate`` keeps its default.

    Args:
        match: Match object whose group 1 holds the numeric amount
            (without the currency sign).
        rate: Multiplier applied to the amount. Defaults to 6.5, the value
            that used to be hard-coded. Use ``functools.partial`` to pass
            a different rate through ``re.sub``.

    Returns:
        The converted amount prefixed with "¥", formatted to two decimals.
    """
    amount = float(match.group(1))
    return f"¥{amount * rate:.2f}"

text = "The price is $100.50 and the tax is $15.25"
converted = re.sub(r"\$(\d+\.\d+)", convert_currency, text)
print(f"货币转换: {converted}")

# 分割
text = "apple,banana;orange:grape"
fruits = re.split(r"[,;:]", text)
print(f"正则分割: {fruits}")

# 编译正则表达式(提高性能)
compiled_pattern = re.compile(r"\b\w+@\w+\.\w+\b")
text = "Emails: alice@test.com, bob@example.org, invalid-email"
emails = compiled_pattern.findall(text)
print(f"编译模式匹配: {emails}")

# Common regular-expression patterns, keyed by a display label
patterns = {
    "邮箱": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    # The middle group accepts 3 or 4 digits so that both 123-456-7890 and
    # the 138-1234-5678 style used in test_text below are matched.
    "电话": r"\d{3}-\d{3,4}-\d{4}",
    "身份证": r"\d{17}[\dXx]",
    "IP地址": r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b",
    "URL": r"https?://[\w\.-]+\.[a-zA-Z]{2,}[\w\.-]*/?[\w\.-]*",
    "中文": r"[\u4e00-\u9fff]+",
    "数字": r"\d+",
    "浮点数": r"\d+\.\d+"
}

test_text = """
联系方式:
邮箱:alice@example.com
电话:138-1234-5678
网站:https://www.example.com
IP:192.168.1.1
价格:99.99元
中文:你好世界
"""

# Run every pattern against the sample text and print only those that match
for name, pattern in patterns.items():
    matches = re.findall(pattern, test_text)
    if matches:
        print(f"{name}: {matches}")

字符串的高级应用

# 文本处理工具类
class TextProcessor:
    def __init__(self):
        self.stop_words = {'的', '了', '在', '是', '我', '有', '和', '就', 
                          'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at'}
    
    def clean_text(self, text):
        """清理文本"""
        import string
        # 转换为小写
        text = text.lower()
        # 移除标点符号
        text = text.translate(str.maketrans('', '', string.punctuation))
        # 移除多余空白
        text = ' '.join(text.split())
        return text
    
    def extract_words(self, text):
        """提取单词"""
        cleaned = self.clean_text(text)
        words = cleaned.split()
        # 过滤停用词
        return [word for word in words if word not in self.stop_words]
    
    def word_frequency(self, text):
        """词频统计"""
        words = self.extract_words(text)
        freq = {}
        for word in words:
            freq[word] = freq.get(word, 0) + 1
        return dict(sorted(freq.items(), key=lambda x: x[1], reverse=True))
    
    def find_keywords(self, text, top_n=5):
        """提取关键词"""
        freq = self.word_frequency(text)
        return list(freq.keys())[:top_n]
    
    def similarity(self, text1, text2):
        """计算文本相似度(简单版本)"""
        words1 = set(self.extract_words(text1))
        words2 = set(self.extract_words(text2))
        
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        
        return intersection / union if union > 0 else 0
    
    def summarize(self, text, max_sentences=3):
        """简单文本摘要"""
        sentences = text.split('。')
        sentences = [s.strip() for s in sentences if s.strip()]
        
        if len(sentences) <= max_sentences:
            return text
        
        # 简单评分:句子长度和关键词数量
        word_freq = self.word_frequency(text)
        top_words = set(list(word_freq.keys())[:10])
        
        sentence_scores = []
        for sentence in sentences:
            words = self.extract_words(sentence)
            score = len(words) + sum(1 for word in words if word in top_words)
            sentence_scores.append((sentence, score))
        
        # 选择得分最高的句子
        top_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)[:max_sentences]
        return '。'.join([s[0] for s in top_sentences]) + '。'

# 使用文本处理器
processor = TextProcessor()

sample_text = """
Python是一种高级编程语言。它具有简洁的语法和强大的功能。
Python广泛应用于Web开发、数据科学、人工智能等领域。
学习Python可以帮助你快速开发各种应用程序。
Python的社区非常活跃,有丰富的第三方库可以使用。
"""

print(f"原文本: {sample_text}")
print(f"清理后: {processor.clean_text(sample_text)}")
print(f"词频统计: {processor.word_frequency(sample_text)}")
print(f"关键词: {processor.find_keywords(sample_text)}")
print(f"摘要: {processor.summarize(sample_text, 2)}")

# 文本相似度
text1 = "Python是一种编程语言"
text2 = "Python是一种高级编程语言"
similarity = processor.similarity(text1, text2)
print(f"文本相似度: {similarity:.2f}")

# Memory optimisation for strings
import sys

# String interning: two identical short literals
a = "hello"
b = "hello"
print(f"字符串驻留: {a is b}")  # True

# Long strings are not interned automatically
long_a = "hello" * 1000
long_b = "hello" * 1000
print(f"长字符串驻留: {long_a is long_b}")  # False

# Manual interning: sys.intern returns one shared object for equal strings
import sys
long_a_interned = sys.intern(long_a)
long_b_interned = sys.intern(long_b)
print(f"手动驻留: {long_a_interned is long_b_interned}")  # True

# Size comparison.
# NOTE(review): sys.getsizeof reports the size of a single string object and
# interning does not shrink a string, so these two numbers are expected to be
# equal; the saving from interning comes from equal strings sharing one object.
print(f"普通字符串内存: {sys.getsizeof(long_a)}")
print(f"驻留字符串内存: {sys.getsizeof(long_a_interned)}")

# 字符串构建性能比较
import time

def string_concatenation(n):
    """Build the decimal digits of 0..n-1 by repeated ``+=`` (the slow way)."""
    joined = ""
    for piece in map(str, range(n)):
        joined += piece
    return joined

def string_join(n):
    """Build the decimal digits of 0..n-1 with a single ``str.join`` (the fast way)."""
    pieces = map(str, range(n))
    return ''.join(pieces)

def string_format(n):
    """Build the decimal digits of 0..n-1, formatting each number before joining."""
    formatted = ["{}".format(i) for i in range(n)]
    return ''.join(formatted)

n = 10000

# Time each string-building strategy once.
# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, so short runs are not measured as 0.
start = time.perf_counter()
result1 = string_concatenation(n)
time1 = time.perf_counter() - start

start = time.perf_counter()
result2 = string_join(n)
time2 = time.perf_counter() - start

start = time.perf_counter()
result3 = string_format(n)
time3 = time.perf_counter() - start

print(f"字符串连接耗时: {time1:.4f}秒")
print(f"join方法耗时: {time2:.4f}秒")
print(f"格式化耗时: {time3:.4f}秒")
# Guard the ratio so an unmeasurably fast join cannot raise ZeroDivisionError
speedup = time1 / time2 if time2 > 0 else float('inf')
print(f"join比连接快: {speedup:.1f}倍")

5.6 数据结构的选择和性能比较

import time
import sys
from collections import deque, defaultdict, Counter
import random

# 性能测试函数
def performance_test(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` once and measure how long it takes.

    This is a plain timing helper, not a decorator: it runs the call
    immediately instead of returning a wrapped function.

    Args:
        func: The callable to time.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, elapsed_seconds)`` tuple.
    """
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # and is too coarse on some platforms to time short calls.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start_time
    return result, elapsed

# 列表 vs 元组性能比较
def list_vs_tuple_performance():
    """Print a list-versus-tuple comparison.

    Reports, for one million integers, the time to build each container,
    the time to read every 1000th element by index, and the shallow
    ``sys.getsizeof`` of each container.
    """
    print("=== 列表 vs 元组性能比较 ===")
    n = 1000000

    def build_list():
        return [i for i in range(n)]

    def build_tuple():
        return tuple(i for i in range(n))

    def strided_sum(container):
        # Index every 1000th position and add up what is found there
        return sum(container[i] for i in range(0, n, 1000))

    # Construction cost
    list_create_time = performance_test(build_list)[1]
    tuple_create_time = performance_test(build_tuple)[1]

    print(f"创建{n}个元素:")
    print(f"  列表: {list_create_time:.4f}秒")
    print(f"  元组: {tuple_create_time:.4f}秒")

    # Indexed access cost
    test_list = list(range(n))
    test_tuple = tuple(range(n))

    list_access_time = performance_test(strided_sum, test_list)[1]
    tuple_access_time = performance_test(strided_sum, test_tuple)[1]

    print("随机访问:")
    print(f"  列表: {list_access_time:.4f}秒")
    print(f"  元组: {tuple_access_time:.4f}秒")

    # Container size (shallow: the integers themselves are not counted)
    list_bytes = sys.getsizeof(test_list)
    tuple_bytes = sys.getsizeof(test_tuple)
    print("内存使用:")
    print(f"  列表: {list_bytes} 字节")
    print(f"  元组: {tuple_bytes} 字节")

# 列表 vs 双端队列性能比较
def list_vs_deque_performance():
    """Print a list-versus-deque comparison for head and tail insertion.

    Inserts 100000 integers at the front and then at the back of each
    container and prints the measured times, plus the head-insertion
    speed-up of the deque over the list.
    """
    print("\n=== 列表 vs 双端队列性能比较 ===")
    n = 100000

    # Head insertion
    def list_prepend():
        lst = []
        for i in range(n):
            lst.insert(0, i)
        return lst

    def deque_prepend():
        dq = deque()
        for i in range(n):
            dq.appendleft(i)
        return dq

    _, list_prepend_time = performance_test(list_prepend)
    _, deque_prepend_time = performance_test(deque_prepend)

    # The fast side can be measured as 0 seconds on a coarse clock; report an
    # infinite speed-up instead of raising ZeroDivisionError.
    if deque_prepend_time > 0:
        prepend_speedup = list_prepend_time / deque_prepend_time
    else:
        prepend_speedup = float("inf")

    print(f"头部插入{n}个元素:")
    print(f"  列表: {list_prepend_time:.4f}秒")
    print(f"  双端队列: {deque_prepend_time:.4f}秒")
    print(f"  性能提升: {prepend_speedup:.1f}倍")

    # Tail insertion
    def list_append():
        lst = []
        for i in range(n):
            lst.append(i)
        return lst

    def deque_append():
        dq = deque()
        for i in range(n):
            dq.append(i)
        return dq

    _, list_append_time = performance_test(list_append)
    _, deque_append_time = performance_test(deque_append)

    print(f"尾部插入{n}个元素:")
    print(f"  列表: {list_append_time:.4f}秒")
    print(f"  双端队列: {deque_append_time:.4f}秒")

# 字典 vs 列表查找性能
def dict_vs_list_lookup():
    """Print a membership-test comparison between a list and a dict.

    Looks up 1000 randomly sampled keys in a 100000-element list and in a
    dict with the same keys, then prints both times and the speed-up.
    """
    print("\n=== 字典 vs 列表查找性能 ===")
    n = 100000

    # Test data: the same keys in both containers, plus the keys to look up
    data_list = list(range(n))
    data_dict = {i: i for i in range(n)}
    search_items = random.sample(range(n), 1000)

    # List lookup: each ``in`` scans the list
    def list_lookup():
        return sum(1 for item in search_items if item in data_list)

    # Dict lookup: each ``in`` is a hash lookup
    def dict_lookup():
        return sum(1 for item in search_items if item in data_dict)

    _, list_lookup_time = performance_test(list_lookup)
    _, dict_lookup_time = performance_test(dict_lookup)

    # 1000 dict lookups can be measured as 0 seconds on a coarse clock;
    # report an infinite speed-up instead of raising ZeroDivisionError.
    if dict_lookup_time > 0:
        speedup = list_lookup_time / dict_lookup_time
    else:
        speedup = float("inf")

    print(f"查找{len(search_items)}个元素:")
    print(f"  列表: {list_lookup_time:.4f}秒")
    print(f"  字典: {dict_lookup_time:.4f}秒")
    print(f"  性能提升: {speedup:.1f}倍")

# 集合 vs 列表去重性能
def set_vs_list_dedup():
    """Print a comparison of three de-duplication strategies.

    De-duplicates 100000 random integers with a list scan, with ``set``
    and with ``dict.fromkeys``, then prints the three times and the
    speed-up of the set approach over the list approach.
    """
    print("\n=== 集合 vs 列表去重性能 ===")
    n = 100000

    # Input with many repeated values
    data_with_duplicates = [random.randint(0, n//10) for _ in range(n)]

    # List-based: membership is checked by scanning the result list
    def list_dedup():
        unique = []
        for item in data_with_duplicates:
            if item not in unique:
                unique.append(item)
        return unique

    # Set-based: does not preserve the input order
    def set_dedup():
        return list(set(data_with_duplicates))

    # Dict-based: preserves first-seen order
    def dict_dedup():
        return list(dict.fromkeys(data_with_duplicates))

    _, list_dedup_time = performance_test(list_dedup)
    _, set_dedup_time = performance_test(set_dedup)
    _, dict_dedup_time = performance_test(dict_dedup)

    # The set run can be measured as 0 seconds on a coarse clock; report an
    # infinite speed-up instead of raising ZeroDivisionError.
    if set_dedup_time > 0:
        speedup = list_dedup_time / set_dedup_time
    else:
        speedup = float("inf")

    print(f"去重{n}个元素:")
    print(f"  列表方法: {list_dedup_time:.4f}秒")
    print(f"  集合方法: {set_dedup_time:.4f}秒")
    print(f"  字典方法: {dict_dedup_time:.4f}秒")
    print(f"  集合比列表快: {speedup:.1f}倍")

# 不同数据结构的内存使用
def memory_usage_comparison():
    """Print the shallow ``sys.getsizeof`` of five containers of 10000 integers."""
    print("\n=== 内存使用比较 ===")
    n = 10000

    # (display label, container) pairs, in the order they are reported
    containers = [
        ("列表", list(range(n))),
        ("元组", tuple(range(n))),
        ("集合", set(range(n))),
        ("字典", {i: i for i in range(n)}),
        ("双端队列", deque(range(n))),
    ]

    print(f"{n}个整数的内存使用:")
    for label, container in containers:
        size = sys.getsizeof(container)
        print(f"  {label}: {size:,} 字节")

# 数据结构选择指南
def data_structure_guide():
    """Print a selection guide for the built-in containers.

    For each structure the guide lists suitable scenarios, unsuitable
    scenarios and the time complexity of its main operations.
    """
    print("\n=== 数据结构选择指南 ===")

    # One row per structure:
    # (name, suitable scenarios, unsuitable scenarios, time complexity)
    rows = [
        (
            "列表 (list)",
            ["需要有序存储", "需要索引访问", "需要修改元素", "需要在尾部频繁添加/删除"],
            ["需要在头部频繁添加/删除", "需要快速查找", "不需要修改的数据"],
            "访问O(1), 搜索O(n), 插入O(n), 删除O(n)",
        ),
        (
            "元组 (tuple)",
            ["不可变数据", "作为字典键", "函数返回多个值", "配置数据"],
            ["需要修改数据", "需要频繁添加/删除"],
            "访问O(1), 搜索O(n)",
        ),
        (
            "字典 (dict)",
            ["键值对存储", "快速查找", "计数统计", "缓存数据"],
            ["需要有序存储(Python 3.7前)", "键不可哈希"],
            "访问O(1), 搜索O(1), 插入O(1), 删除O(1)",
        ),
        (
            "集合 (set)",
            ["去重", "成员检查", "集合运算", "唯一性约束"],
            ["需要有序存储", "需要索引访问", "元素不可哈希"],
            "搜索O(1), 插入O(1), 删除O(1)",
        ),
        (
            "双端队列 (deque)",
            ["队列操作", "栈操作", "滑动窗口", "头尾频繁操作"],
            ["需要随机访问", "需要索引操作"],
            "两端操作O(1), 中间操作O(n)",
        ),
    ]

    for structure, suitable, unsuitable, complexity in rows:
        print(f"\n{structure}:")
        print(f"  适用场景: {', '.join(suitable)}")
        print(f"  不适用场景: {', '.join(unsuitable)}")
        print(f"  时间复杂度: {complexity}")

# When executed as a script: run every benchmark in turn, then print the
# data-structure selection guide.
if __name__ == "__main__":
    list_vs_tuple_performance()
    list_vs_deque_performance()
    dict_vs_list_lookup()
    set_vs_list_dedup()
    memory_usage_comparison()
    data_structure_guide()

5.7 综合示例:数据分析工具

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
数据分析工具

功能:
1. 数据加载和清洗
2. 统计分析
3. 数据可视化(文本形式)
4. 报告生成

作者: Python学习者
版本: 1.0
"""

import json
import csv
import statistics
from collections import defaultdict, Counter, namedtuple
from typing import List, Dict, Any, Optional, Union
import re
from datetime import datetime

# 数据记录类型
DataRecord = namedtuple('DataRecord', ['id', 'timestamp', 'category', 'value', 'metadata'])


class DataAnalyzer:
    """In-memory analyser for a list of ``DataRecord`` rows.

    Attributes:
        data: The loaded records.
        categories: The set of category strings present in ``data``.
        date_range: ``(earliest, latest)`` timestamp strings of the records
            that have a timestamp, or ``None`` when there are none.
    """

    def __init__(self):
        self.data: List[DataRecord] = []
        self.categories = set()
        self.date_range = None

    def _add_records(self, records: List[DataRecord]) -> None:
        """Append ``records`` and refresh the derived categories and date range."""
        self.data.extend(records)
        self.categories.update(record.category for record in records)
        self._update_date_range()

    def load_from_csv(self, filename: str) -> bool:
        """Load records from a UTF-8 CSV file with a header row.

        The columns ``id``, ``timestamp``, ``category``, ``value`` and
        ``metadata`` are read by name. The load is all-or-nothing: if the
        file cannot be read or any ``value`` is not numeric, nothing is
        added.

        Returns:
            True on success, False (after printing the error) on failure.
        """
        try:
            # newline='' is what the csv module expects for correct handling
            # of line breaks embedded in quoted fields.
            with open(filename, 'r', encoding='utf-8', newline='') as file:
                records = [
                    DataRecord(
                        id=row.get('id', ''),
                        timestamp=row.get('timestamp', ''),
                        category=row.get('category', ''),
                        value=float(row.get('value', 0)),
                        metadata=row.get('metadata', '{}')
                    )
                    for row in csv.DictReader(file)
                ]
        except (OSError, ValueError, TypeError, csv.Error) as e:
            print(f"加载CSV文件失败: {e}")
            return False

        # Commit only after the whole file parsed, so a failure part-way
        # through never leaves half of the file loaded.
        self._add_records(records)
        print(f"成功加载 {len(self.data)} 条记录")
        return True

    def load_from_json(self, filename: str) -> bool:
        """Load records from a UTF-8 JSON file holding a list of objects.

        Each object's ``metadata`` value is re-serialised to a JSON string.
        The load is all-or-nothing: on any error nothing is added.

        Returns:
            True on success, False (after printing the error) on failure.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                json_data = json.load(file)

            records = [
                DataRecord(
                    id=item.get('id', ''),
                    timestamp=item.get('timestamp', ''),
                    category=item.get('category', ''),
                    value=float(item.get('value', 0)),
                    metadata=json.dumps(item.get('metadata', {}))
                )
                for item in json_data
            ]
        except (OSError, ValueError, TypeError, AttributeError) as e:
            # AttributeError/TypeError cover a payload that is not a list of
            # objects; ValueError covers malformed JSON and non-numeric values.
            print(f"加载JSON文件失败: {e}")
            return False

        self._add_records(records)
        print(f"成功加载 {len(self.data)} 条记录")
        return True

    def add_sample_data(self, count: int = 1000) -> None:
        """Append ``count`` randomly generated records.

        Timestamps fall within the last 30 days, values are uniform in
        [10, 1000], and IDs are ``REC0000``, ``REC0001``, ... so calling
        this twice produces duplicate IDs.

        Args:
            count: Number of records to generate. Defaults to 1000.
        """
        import random
        from datetime import datetime, timedelta

        categories = ['销售', '市场', '技术', '客服', '财务']
        base_date = datetime.now() - timedelta(days=30)

        records = []
        for i in range(count):
            timestamp = base_date + timedelta(days=random.randint(0, 30))
            records.append(DataRecord(
                id=f"REC{i:04d}",
                timestamp=timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                category=random.choice(categories),
                value=random.uniform(10, 1000),
                metadata=json.dumps({
                    'source': random.choice(['web', 'mobile', 'api']),
                    'region': random.choice(['北京', '上海', '广州', '深圳'])
                })
            ))

        self._add_records(records)
        print(f"添加了 {count} 条示例数据")

    def _update_date_range(self) -> None:
        """Recompute ``date_range`` from the current records.

        Timestamps are compared as strings. The range is reset to ``None``
        when no record has a timestamp, so it never goes stale after the
        data has been emptied.
        """
        timestamps = [record.timestamp for record in self.data if record.timestamp]
        self.date_range = (min(timestamps), max(timestamps)) if timestamps else None

    def clean_data(self, min_value: float = 0, max_value: float = 10000) -> Dict[str, int]:
        """Drop incomplete, out-of-range and duplicate records in place.

        A record is dropped when its ``id`` or ``category`` is empty, when
        its value is outside ``[min_value, max_value]`` (NaN counts as
        outside), or when an earlier kept record has the same ``id``.

        Args:
            min_value: Smallest accepted value. Defaults to 0.
            max_value: Largest accepted value. Defaults to 10000.

        Returns:
            Counters keyed by '原始记录', '删除空值', '删除异常值',
            '删除重复' and '清洗后记录'.
        """
        stats = {
            '原始记录': len(self.data),
            '删除空值': 0,
            '删除异常值': 0,
            '删除重复': 0
        }

        seen_ids = set()
        kept = []
        for record in self.data:
            # Required fields
            if not record.id or not record.category:
                stats['删除空值'] += 1
                continue

            # Written as a chained comparison so that NaN, which fails every
            # comparison, is rejected too.
            if not (min_value <= record.value <= max_value):
                stats['删除异常值'] += 1
                continue

            # De-duplicate by ID, keeping the first occurrence
            if record.id in seen_ids:
                stats['删除重复'] += 1
                continue

            seen_ids.add(record.id)
            kept.append(record)

        self.data = kept
        stats['清洗后记录'] = len(self.data)

        # Derived state must reflect the cleaned data
        self.categories = {record.category for record in self.data}
        self._update_date_range()

        return stats

    def get_basic_stats(self) -> Dict[str, Any]:
        """Return overall statistics, or an empty dict when there is no data.

        Returns:
            A dict with '记录总数', '类别数量', '数值统计' (sum, mean,
            median, max, min, sample standard deviation) and '日期范围'.
        """
        if not self.data:
            return {}

        values = [record.value for record in self.data]

        return {
            '记录总数': len(self.data),
            '类别数量': len(self.categories),
            '数值统计': {
                '总和': sum(values),
                '平均值': statistics.mean(values),
                '中位数': statistics.median(values),
                '最大值': max(values),
                '最小值': min(values),
                # stdev() needs at least two data points
                '标准差': statistics.stdev(values) if len(values) > 1 else 0
            },
            '日期范围': self.date_range
        }

    def get_category_stats(self) -> Dict[str, Dict[str, float]]:
        """Return count, sum, mean, max and min of the values per category."""
        category_data = defaultdict(list)
        for record in self.data:
            category_data[record.category].append(record.value)

        return {
            category: {
                '数量': len(values),
                '总和': sum(values),
                '平均值': statistics.mean(values),
                '最大值': max(values),
                '最小值': min(values)
            }
            for category, values in category_data.items()
        }

    def get_time_series(self, group_by='day') -> Dict[str, Dict[str, float]]:
        """Aggregate values into time buckets.

        Records without a timestamp, or whose timestamp is not in
        ``%Y-%m-%d %H:%M:%S`` format, are skipped.

        Args:
            group_by: 'day', 'month' or 'hour'. Any other value groups by
                the full timestamp string.

        Returns:
            A dict, sorted by bucket key, mapping each bucket to its
            '数量', '总和' and '平均值'.
        """
        bucket_formats = {
            'day': '%Y-%m-%d',
            'month': '%Y-%m',
            'hour': '%Y-%m-%d %H:00',
        }
        bucket_format = bucket_formats.get(group_by)

        time_data = defaultdict(list)
        for record in self.data:
            if not record.timestamp:
                continue

            try:
                dt = datetime.strptime(record.timestamp, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                continue

            key = dt.strftime(bucket_format) if bucket_format else record.timestamp
            time_data[key].append(record.value)

        return {
            time_key: {
                '数量': len(values),
                '总和': sum(values),
                '平均值': statistics.mean(values)
            }
            for time_key, values in sorted(time_data.items())
        }

    def find_outliers(self, method='iqr') -> List[DataRecord]:
        """Return the records whose value is an outlier.

        Args:
            method: 'iqr' flags values more than 1.5 IQR outside the
                quartiles; 'zscore' flags values more than 3 sample
                standard deviations from the mean. Any other value
                yields no outliers.

        Returns:
            The outlier records in data order. Always empty when there
            are fewer than two records, because neither quartiles nor a
            standard deviation can be computed.
        """
        values = [record.value for record in self.data]

        # statistics.quantiles() and stdev() raise StatisticsError below two
        # data points; with so little data nothing can be called an outlier.
        if len(values) < 2:
            return []

        if method == 'iqr':
            q1, _, q3 = statistics.quantiles(values, n=4)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            return [record for record in self.data
                    if record.value < lower_bound or record.value > upper_bound]

        if method == 'zscore':
            mean_val = statistics.mean(values)
            std_val = statistics.stdev(values)

            # All values identical: no spread, hence no outliers
            if std_val == 0:
                return []

            return [record for record in self.data
                    if abs(record.value - mean_val) / std_val > 3]

        return []

    def get_top_records(self, n=10, by='value') -> List[DataRecord]:
        """Return the first ``n`` records ranked in descending order.

        Args:
            n: Number of records to return.
            by: 'value' or 'timestamp'. Any other value returns the first
                ``n`` records in stored order.
        """
        if by == 'value':
            return sorted(self.data, key=lambda x: x.value, reverse=True)[:n]
        elif by == 'timestamp':
            return sorted(self.data, key=lambda x: x.timestamp, reverse=True)[:n]
        else:
            return self.data[:n]

    def search_records(self, **criteria) -> List[DataRecord]:
        """Return the records matching every given criterion.

        Supported keys: ``category`` (exact match), ``min_value`` /
        ``max_value`` (inclusive), ``date_from`` / ``date_to`` (inclusive
        string comparison against the timestamp) and ``id_pattern``
        (case-insensitive regex searched in the id). Unrecognised keys
        are ignored.
        """
        results = self.data

        for key, value in criteria.items():
            if key == 'category':
                results = [r for r in results if r.category == value]
            elif key == 'min_value':
                results = [r for r in results if r.value >= value]
            elif key == 'max_value':
                results = [r for r in results if r.value <= value]
            elif key == 'date_from':
                results = [r for r in results if r.timestamp >= value]
            elif key == 'date_to':
                results = [r for r in results if r.timestamp <= value]
            elif key == 'id_pattern':
                pattern = re.compile(value, re.IGNORECASE)
                results = [r for r in results if pattern.search(r.id)]

        return results

    def generate_report(self) -> str:
        """Build the plain-text analysis report.

        Sections without content (for example all of them when no data
        is loaded) are left out.

        Returns:
            The report as a single newline-joined string.
        """
        report = []
        report.append("=" * 60)
        report.append("           数据分析报告")
        report.append("=" * 60)
        report.append(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report.append("")

        # Overall statistics
        basic_stats = self.get_basic_stats()
        if basic_stats:
            report.append("基本统计信息:")
            report.append(f"  记录总数: {basic_stats['记录总数']:,}")
            report.append(f"  类别数量: {basic_stats['类别数量']}")

            num_stats = basic_stats['数值统计']
            report.append(f"  数值统计:")
            report.append(f"    总和: {num_stats['总和']:,.2f}")
            report.append(f"    平均值: {num_stats['平均值']:.2f}")
            report.append(f"    中位数: {num_stats['中位数']:.2f}")
            report.append(f"    最大值: {num_stats['最大值']:.2f}")
            report.append(f"    最小值: {num_stats['最小值']:.2f}")
            report.append(f"    标准差: {num_stats['标准差']:.2f}")

            if basic_stats['日期范围']:
                report.append(f"  日期范围: {basic_stats['日期范围'][0]} 到 {basic_stats['日期范围'][1]}")

        # Per-category statistics, largest total first
        category_stats = self.get_category_stats()
        if category_stats:
            report.append("\n分类统计:")
            for category, stats in sorted(category_stats.items(),
                                          key=lambda x: x[1]['总和'], reverse=True):
                report.append(f"  {category}:")
                report.append(f"    数量: {stats['数量']:,}")
                report.append(f"    总和: {stats['总和']:,.2f}")
                report.append(f"    平均值: {stats['平均值']:.2f}")

        # Outliers (a non-empty list implies non-empty data, so the
        # percentage below cannot divide by zero)
        outliers = self.find_outliers()
        if outliers:
            report.append(f"\n异常值检测:")
            report.append(f"  发现 {len(outliers)} 个异常值")
            report.append(f"  异常值占比: {len(outliers)/len(self.data)*100:.1f}%")

            # Only list them individually when the list is short
            if len(outliers) <= 10:
                report.append("  异常值列表:")
                for outlier in outliers:
                    report.append(f"    {outlier.id}: {outlier.value:.2f} ({outlier.category})")

        # Ten largest values
        top_records = self.get_top_records(10)
        if top_records:
            report.append("\n数值排名前10:")
            for i, record in enumerate(top_records, 1):
                report.append(f"  {i:2d}. {record.id}: {record.value:.2f} ({record.category})")

        report.append("\n" + "=" * 60)
        return "\n".join(report)

    def export_summary(self, filename: str) -> bool:
        """Write a JSON summary (statistics, outlier count, top 10) to a file.

        Returns:
            True on success, False (after printing the error) on failure.
        """
        try:
            summary = {
                'basic_stats': self.get_basic_stats(),
                'category_stats': self.get_category_stats(),
                'outliers_count': len(self.find_outliers()),
                'top_records': [record._asdict() for record in self.get_top_records(10)],
                'export_time': datetime.now().isoformat()
            }

            with open(filename, 'w', encoding='utf-8') as file:
                json.dump(summary, file, ensure_ascii=False, indent=2)

        except (OSError, TypeError, ValueError) as e:
            print(f"导出失败: {e}")
            return False

        print(f"摘要已导出到: {filename}")
        return True

def _print_menu():
    """Print the banner and the numbered list of available operations."""
    print("\n" + "=" * 40)
    print("        数据分析工具")
    print("=" * 40)
    print("1. 加载示例数据")
    print("2. 从CSV加载数据")
    print("3. 从JSON加载数据")
    print("4. 清洗数据")
    print("5. 基本统计")
    print("6. 分类统计")
    print("7. 时间序列分析")
    print("8. 异常值检测")
    print("9. 搜索记录")
    print("10. 生成报告")
    print("11. 导出摘要")
    print("12. 退出")


def _print_record_preview(records, remainder_label, limit=10):
    """Print up to ``limit`` records, then how many more were left out.

    Args:
        records: Sliceable sequence of records exposing ``id``, ``value``
            and ``category`` attributes.
        remainder_label: Unit text appended to the overflow line.
        limit: Maximum number of records printed in full.
    """
    for record in records[:limit]:
        print(f"{record.id}: {record.value:.2f} ({record.category})")
    if len(records) > limit:
        print(f"... 还有 {len(records) - limit} {remainder_label}")


def _action_load_sample(analyzer):
    """Menu option 1: load the built-in sample data."""
    analyzer.add_sample_data()


def _action_load_csv(analyzer):
    """Menu option 2: ask for a CSV path and load it."""
    filename = input("请输入CSV文件路径: ").strip()
    analyzer.load_from_csv(filename)


def _action_load_json(analyzer):
    """Menu option 3: ask for a JSON path and load it."""
    filename = input("请输入JSON文件路径: ").strip()
    analyzer.load_from_json(filename)


def _action_clean(analyzer):
    """Menu option 4: clean the data and print the returned counters."""
    stats = analyzer.clean_data()
    print("数据清洗完成:")
    for key, value in stats.items():
        print(f"  {key}: {value:,}")


def _action_basic_stats(analyzer):
    """Menu option 5: print the overall statistics, if any data exists."""
    stats = analyzer.get_basic_stats()
    if not stats:
        print("没有数据")
        return
    print("\n基本统计信息:")
    print(f"记录总数: {stats['记录总数']:,}")
    print(f"类别数量: {stats['类别数量']}")
    num_stats = stats['数值统计']
    print(f"平均值: {num_stats['平均值']:.2f}")
    print(f"中位数: {num_stats['中位数']:.2f}")
    print(f"标准差: {num_stats['标准差']:.2f}")


def _action_category_stats(analyzer):
    """Menu option 6: print count and mean for every category."""
    stats = analyzer.get_category_stats()
    if not stats:
        print("没有数据")
        return
    print("\n分类统计:")
    for category, data in stats.items():
        print(f"{category}: {data['数量']}条记录, 平均值: {data['平均值']:.2f}")


def _action_time_series(analyzer):
    """Menu option 7: print the first 10 time buckets of the time series."""
    # An empty answer falls back to grouping by day.
    group_by = input("分组方式 (day/month/hour): ").strip() or 'day'
    time_series = analyzer.get_time_series(group_by)
    if not time_series:
        print("没有时间数据")
        return
    print(f"\n时间序列分析 (按{group_by}分组):")
    for time_key, data in list(time_series.items())[:10]:  # first 10 only
        print(f"{time_key}: {data['数量']}条记录, 平均值: {data['平均值']:.2f}")
    if len(time_series) > 10:
        print(f"... 还有 {len(time_series) - 10} 个时间点")


def _action_outliers(analyzer):
    """Menu option 8: run outlier detection and preview the results."""
    # An empty answer falls back to the IQR method.
    method = input("检测方法 (iqr/zscore): ").strip() or 'iqr'
    outliers = analyzer.find_outliers(method)
    print(f"\n发现 {len(outliers)} 个异常值:")
    _print_record_preview(outliers, "个异常值")


def _action_search(analyzer):
    """Menu option 9: collect optional filters and preview matching records.

    Raises:
        ValueError: If a non-empty min/max value is not a valid float; the
            caller reports it as an input error.
    """
    print("搜索条件 (留空跳过):")
    criteria = {}

    category = input("类别: ").strip()
    if category:
        criteria['category'] = category

    min_value = input("最小值: ").strip()
    if min_value:
        criteria['min_value'] = float(min_value)

    max_value = input("最大值: ").strip()
    if max_value:
        criteria['max_value'] = float(max_value)

    # Only the filters the user actually filled in are passed along.
    results = analyzer.search_records(**criteria)
    print(f"\n找到 {len(results)} 条记录:")
    _print_record_preview(results, "条记录")


def _action_report(analyzer):
    """Menu option 10: print the report and optionally save it to a file."""
    report = analyzer.generate_report()
    print(report)

    save = input("\n是否保存报告到文件? (y/n): ").lower().strip()
    if save in ['y', 'yes', '是']:
        # Timestamped name so successive reports do not overwrite each other.
        filename = f"data_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"报告已保存到: {filename}")


def _action_export(analyzer):
    """Menu option 11: export the summary to a timestamped JSON file."""
    filename = f"data_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    analyzer.export_summary(filename)


def main():
    """Run the interactive menu loop of the data analysis tool.

    A single ``DataAnalyzer`` is created and reused for the whole session.
    Each iteration prints the menu, reads a choice and dispatches to the
    matching action. Option 12 leaves the loop.

    Error handling per iteration:
        * ``ValueError`` (e.g. a non-numeric search bound) is reported as an
          input error and the menu is shown again.
        * ``KeyboardInterrupt`` ends the program.
        * ``EOFError`` (stdin closed or exhausted) also ends the program.
          Without this branch the generic handler would swallow it and the
          loop would spin forever, because every later ``input()`` call
          raises it again.
        * Any other exception is printed and the menu is shown again.
    """
    analyzer = DataAnalyzer()

    # Menu choice -> action; option 12 (exit) is handled in the loop itself.
    handlers = {
        '1': _action_load_sample,
        '2': _action_load_csv,
        '3': _action_load_json,
        '4': _action_clean,
        '5': _action_basic_stats,
        '6': _action_category_stats,
        '7': _action_time_series,
        '8': _action_outliers,
        '9': _action_search,
        '10': _action_report,
        '11': _action_export,
    }

    while True:
        _print_menu()

        try:
            choice = input("\n请选择操作 (1-12): ").strip()

            if choice == '12':
                print("感谢使用数据分析工具!")
                break

            handler = handlers.get(choice)
            if handler is None:
                print("无效的选择,请重新输入")
            else:
                handler(analyzer)

        except ValueError as e:
            print(f"输入错误: {e}")
        except KeyboardInterrupt:
            print("\n\n程序被用户中断")
            break
        except EOFError:
            # No more input can ever arrive; retrying would loop forever.
            print("\n\n输入已结束,程序退出")
            break
        except Exception as e:
            print(f"发生错误: {e}")

# Start the interactive tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

运行数据分析工具:

python data_analyzer.py

本章小结

本章我们深入学习了Python的数据结构:

  1. 列表的高级操作:切片、方法、性能优化、高级应用
  2. 元组和命名元组:不可变性、解包、命名元组的使用
  3. 字典的高级用法:创建、操作、排序、嵌套字典处理
  4. 集合的操作和应用:基本操作、集合运算、实际应用场景
  5. 字符串的深入处理:方法详解、格式化、正则表达式、高级应用
  6. 性能比较和选择指南:不同数据结构的性能特点和适用场景
  7. 综合应用:通过数据分析工具整合所学知识

下一章预告

下一章我们将学习《函数和模块》,内容包括:

  • 函数的定义和调用
  • 参数传递和作用域
  • 装饰器和闭包
  • 模块和包的使用
  • 标准库介绍

练习题

基础练习

  1. 列表操作

    • 实现列表的快速排序算法
    • 编写函数合并两个有序列表
    • 实现列表的二分查找
  2. 字典应用

    • 实现一个简单的缓存系统
    • 编写词频统计程序
    • 创建多级字典的安全访问函数

进阶练习

  1. 集合运算

    • 实现集合的幂集生成
    • 编写图的邻接表表示
    • 实现简单的推荐系统
  2. 字符串处理

    • 实现简单的模板引擎
    • 编写文本相似度计算函数
    • 创建日志解析器

综合练习

  1. 项目实战
    • 扩展数据分析工具,添加更多统计功能
    • 实现简单的数据库查询引擎
    • 创建文本处理和分析工具

提示:数据结构是程序的基础,选择合适的数据结构能显著提高程序性能。多练习不同场景下的应用,培养数据结构选择的直觉。