学习目标
通过本章学习,你将掌握:
- 列表的高级操作和性能优化
- 元组和命名元组的使用
- 字典的高级用法和技巧
- 集合的操作和应用
- 字符串的深入处理
- 数据结构的选择和性能比较
5.1 列表的高级操作
列表的创建和初始化
# 不同的列表创建方式
# 1. 直接创建
fruits = ["apple", "banana", "orange"]
print(f"水果列表: {fruits}")
# 2. 使用list()构造函数
numbers = list(range(1, 6))
print(f"数字列表: {numbers}")
# 3. 列表推导式
squares = [x**2 for x in range(1, 6)]
print(f"平方数列表: {squares}")
# 4. 重复元素
zeros = [0] * 5
print(f"零列表: {zeros}")
# 5. 嵌套列表
matrix = [[0 for _ in range(3)] for _ in range(3)]
print(f"3x3矩阵: {matrix}")
# 注意:避免这样创建嵌套列表
# wrong_matrix = [[0] * 3] * 3 # 所有行都是同一个对象的引用
# 6. 从其他可迭代对象创建
char_list = list("hello")
print(f"字符列表: {char_list}")
# 7. 使用生成器创建
def fibonacci(n):
    """Yield the first ``n`` Fibonacci numbers, starting with 0, 1, 1, 2, ...

    Nothing is yielded when ``n`` is zero or negative.
    """
    # ``pair`` holds (value to emit now, value to emit next).
    pair = (0, 1)
    for _ in range(n):
        yield pair[0]
        pair = (pair[1], pair[0] + pair[1])
fib_list = list(fibonacci(10))
print(f"斐波那契数列: {fib_list}")
列表的高级操作
# 列表的切片操作
numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# 基本切片
print(f"前5个元素: {numbers[:5]}")
print(f"后5个元素: {numbers[-5:]}")
print(f"中间元素: {numbers[2:8]}")
print(f"每隔一个: {numbers[::2]}")
print(f"反转列表: {numbers[::-1]}")
# 切片赋值
numbers[2:5] = [20, 30, 40]
print(f"切片赋值后: {numbers}")
# 插入多个元素
numbers[2:2] = [15, 25]
print(f"插入元素后: {numbers}")
# 删除元素
del numbers[2:4]
print(f"删除元素后: {numbers}")
# 列表的方法
fruits = ["apple", "banana", "orange", "apple"]
# 添加元素
fruits.append("grape") # 末尾添加
fruits.insert(1, "kiwi") # 指定位置插入
fruits.extend(["mango", "pear"]) # 扩展列表
print(f"添加后: {fruits}")
# 删除元素
fruits.remove("apple") # 删除第一个匹配的元素
popped = fruits.pop() # 删除并返回最后一个元素
popped_index = fruits.pop(1) # 删除并返回指定索引的元素
print(f"删除后: {fruits}")
print(f"弹出的元素: {popped}, {popped_index}")
# 查找和计数
fruits = ["apple", "banana", "orange", "apple", "grape"]
print(f"apple的索引: {fruits.index('apple')}")
print(f"apple的数量: {fruits.count('apple')}")
# 排序和反转
numbers = [3, 1, 4, 1, 5, 9, 2, 6]
numbers.sort() # 原地排序
print(f"排序后: {numbers}")
numbers.reverse() # 原地反转
print(f"反转后: {numbers}")
# 不修改原列表的排序
original = [3, 1, 4, 1, 5, 9, 2, 6]
sorted_list = sorted(original) # 返回新列表
reversed_list = list(reversed(original)) # 返回新列表
print(f"原列表: {original}")
print(f"排序的新列表: {sorted_list}")
print(f"反转的新列表: {reversed_list}")
# 自定义排序
students = [
{"name": "Alice", "age": 20, "grade": 85},
{"name": "Bob", "age": 19, "grade": 92},
{"name": "Charlie", "age": 21, "grade": 78}
]
# 按年龄排序
students_by_age = sorted(students, key=lambda x: x["age"])
print("按年龄排序:")
for student in students_by_age:
print(f" {student['name']}: {student['age']}岁")
# 按成绩降序排序
students_by_grade = sorted(students, key=lambda x: x["grade"], reverse=True)
print("按成绩降序排序:")
for student in students_by_grade:
print(f" {student['name']}: {student['grade']}分")
# 多条件排序
students_multi = sorted(students, key=lambda x: (-x["grade"], x["age"]))
print("按成绩降序,年龄升序排序:")
for student in students_multi:
print(f" {student['name']}: {student['grade']}分, {student['age']}岁")
列表的性能优化
import time
import sys
from collections import deque
# 列表vs双端队列性能比较
def performance_comparison(n=100000):
    """Time ``n`` front insertions into a list versus a deque and print the results.

    Args:
        n: Number of elements to insert at the head of each container.
            Defaults to 100000, the size the demo originally hard-coded.

    Returns:
        None. The timings are reported with ``print``.
    """
    # perf_counter() is a high-resolution clock intended for measuring short
    # durations; time.time() is wall-clock time and may be coarse or jump.
    started = time.perf_counter()
    lst = []
    for i in range(n):
        lst.insert(0, i)  # every existing element is shifted on each insert
    list_time = time.perf_counter() - started

    started = time.perf_counter()
    dq = deque()
    for i in range(n):
        dq.appendleft(i)
    deque_time = time.perf_counter() - started

    print(f"列表头部插入{n}个元素耗时: {list_time:.4f}秒")
    print(f"双端队列头部插入{n}个元素耗时: {deque_time:.4f}秒")
    # Guard the ratio: for tiny n the deque loop can measure as 0 seconds.
    if deque_time > 0:
        print(f"性能提升: {list_time/deque_time:.1f}倍")
    else:
        print("性能提升: 无法计算(双端队列耗时过短)")
# performance_comparison()
# 内存使用优化
def memory_optimization(n=1000000):
    """Time three ways of building a list of ``n`` integers and print the results.

    The strategies compared are: repeated ``append``, filling a pre-sized
    ``[None] * n`` list by index, and a list comprehension.

    Args:
        n: Number of elements to build. Defaults to 1000000, the size the
            demo originally hard-coded.

    Returns:
        None. The timings and the list size are reported with ``print``.
    """
    # perf_counter() is the clock meant for timing short code sections.

    # Strategy 1: grow the list one append at a time.
    started = time.perf_counter()
    lst1 = []
    for i in range(n):
        lst1.append(i)
    time1 = time.perf_counter() - started

    # Strategy 2: allocate all slots first, then assign by index.
    started = time.perf_counter()
    lst2 = [None] * n
    for i in range(n):
        lst2[i] = i
    time2 = time.perf_counter() - started

    # Strategy 3: list comprehension (kept as a comprehension on purpose,
    # because that is the technique being measured).
    started = time.perf_counter()
    lst3 = [i for i in range(n)]
    time3 = time.perf_counter() - started

    print(f"逐个添加耗时: {time1:.4f}秒")
    print(f"预分配空间耗时: {time2:.4f}秒")
    print(f"列表推导式耗时: {time3:.4f}秒")
    # sys.getsizeof is shallow: it measures the list object itself, not the
    # integers it references.
    print(f"列表内存使用: {sys.getsizeof(lst1)} 字节")
# memory_optimization()
# 列表的深拷贝和浅拷贝
import copy
original = [[1, 2, 3], [4, 5, 6]]
# 浅拷贝
shallow_copy = original.copy() # 或者 original[:] 或 list(original)
shallow_copy[0][0] = 999
print(f"原列表: {original}") # 内部列表被修改
print(f"浅拷贝: {shallow_copy}")
# 深拷贝
original = [[1, 2, 3], [4, 5, 6]]
deep_copy = copy.deepcopy(original)
deep_copy[0][0] = 999
print(f"原列表: {original}") # 内部列表未被修改
print(f"深拷贝: {deep_copy}")
# Memory view of the list's values.
# The memoryview is taken over a bytearray built from the list, not over the
# list itself, so it views a copy of the values. Building a bytearray from
# integers requires every value to be in range(256).
numbers = [1, 2, 3, 4, 5]
view = memoryview(bytearray(numbers))
print(f"内存视图: {list(view)}")
列表的高级应用
# 列表的分组和分块
def chunk_list(lst, chunk_size):
    """Split ``lst`` into consecutive slices of at most ``chunk_size`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``. Returns a list of the slices.
    """
    pieces = []
    for start in range(0, len(lst), chunk_size):
        stop = start + chunk_size
        pieces.append(lst[start:stop])
    return pieces
numbers = list(range(1, 21))
chunks = chunk_list(numbers, 5)
print(f"分块结果: {chunks}")
# 列表的扁平化
def flatten_list(nested_list):
    """Return a flat list of every non-list element, in depth-first order.

    Only ``list`` instances are descended into; tuples and other containers
    are kept as single elements.
    """
    def _walk(items):
        # Recursively emit leaves, descending only into real lists.
        for element in items:
            if isinstance(element, list):
                yield from _walk(element)
            else:
                yield element

    return list(_walk(nested_list))
nested = [1, [2, 3], [4, [5, 6]], 7]
flat = flatten_list(nested)
print(f"扁平化结果: {flat}")
# 使用itertools.chain扁平化
from itertools import chain
nested_simple = [[1, 2], [3, 4], [5, 6]]
flat_chain = list(chain.from_iterable(nested_simple))
print(f"chain扁平化: {flat_chain}")
# 列表的去重(保持顺序)
def remove_duplicates(lst):
    """Return the items of ``lst`` with later duplicates dropped.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable because a set tracks what was seen.
    """
    ordered = []
    already_seen = set()
    for element in lst:
        if element in already_seen:
            continue
        already_seen.add(element)
        ordered.append(element)
    return ordered
with_duplicates = [1, 2, 3, 2, 4, 1, 5]
unique = remove_duplicates(with_duplicates)
print(f"去重结果: {unique}")
# 使用dict.fromkeys去重(Python 3.7+保持顺序)
unique_dict = list(dict.fromkeys(with_duplicates))
print(f"dict去重结果: {unique_dict}")
# 列表的交集、并集、差集
list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
# 交集
intersection = [x for x in list1 if x in list2]
print(f"交集: {intersection}")
# 并集(去重)
union = list(set(list1) | set(list2))
print(f"并集: {sorted(union)}")
# 差集
difference = [x for x in list1 if x not in list2]
print(f"差集: {difference}")
# 列表的旋转
def rotate_list(lst, k):
    """Rotate ``lst`` to the right by ``k`` positions.

    Args:
        lst: Sequence to rotate (anything that supports ``len``, slicing
            and ``+``).
        k: Number of positions. Values larger than ``len(lst)`` wrap around;
            negative values rotate to the left.

    Returns:
        A new sequence with the rotated contents. An empty (or falsy) input
        is returned unchanged.
    """
    if not lst:
        return lst
    # No special case for k == 0: after the modulo, the slices below yield
    # lst[0:] + lst[:0], i.e. a fresh copy. Returning the input object itself
    # for k == 0 (as before) let callers mutate the original by accident.
    k %= len(lst)
    return lst[-k:] + lst[:-k]
numbers = [1, 2, 3, 4, 5]
rotated = rotate_list(numbers, 2)
print(f"旋转2位: {rotated}")
# 列表的滑动窗口
def sliding_window(lst, window_size):
    """Yield every contiguous slice of ``lst`` that is ``window_size`` long.

    Args:
        lst: Sequence to scan (must support ``len`` and slicing).
        window_size: Length of each window; must be at least 1.

    Yields:
        Slices ``lst[i:i + window_size]`` for each valid start index. Nothing
        is yielded when ``window_size`` exceeds ``len(lst)``.

    Raises:
        ValueError: If ``window_size`` is less than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A size of 0 or below used to produce a stream of empty slices.
    if window_size < 1:
        raise ValueError("window_size必须是正整数")
    for start in range(len(lst) - window_size + 1):
        yield lst[start:start + window_size]
numbers = [1, 2, 3, 4, 5, 6, 7]
windows = list(sliding_window(numbers, 3))
print(f"滑动窗口(大小3): {windows}")
5.2 元组和命名元组
元组的基础操作
# 元组的创建
# 1. 使用圆括号
point = (3, 4)
print(f"坐标点: {point}")
# 2. 不使用圆括号(但推荐使用)
color = 255, 128, 0
print(f"颜色: {color}")
# 3. 单元素元组(注意逗号)
single = (42,) # 或者 42,
print(f"单元素元组: {single}")
print(f"类型: {type(single)}")
# 4. 空元组
empty = ()
print(f"空元组: {empty}")
# 5. 使用tuple()构造函数
from_list = tuple([1, 2, 3])
from_string = tuple("hello")
print(f"从列表创建: {from_list}")
print(f"从字符串创建: {from_string}")
# 元组的不可变性
point = (3, 4)
# point[0] = 5 # 这会引发TypeError
# 但是元组中的可变对象可以修改
data = ([1, 2], [3, 4])
data[0].append(3) # 修改列表内容
print(f"修改后的元组: {data}")
# 元组的操作
numbers = (1, 2, 3, 4, 5)
# 索引和切片
print(f"第一个元素: {numbers[0]}")
print(f"最后一个元素: {numbers[-1]}")
print(f"前三个元素: {numbers[:3]}")
# 长度和成员检查
print(f"元组长度: {len(numbers)}")
print(f"3在元组中: {3 in numbers}")
# 计数和查找
data = (1, 2, 3, 2, 4, 2, 5)
print(f"2的数量: {data.count(2)}")
print(f"2的第一个索引: {data.index(2)}")
# 元组的连接和重复
tuple1 = (1, 2, 3)
tuple2 = (4, 5, 6)
combined = tuple1 + tuple2
repeated = tuple1 * 3
print(f"连接: {combined}")
print(f"重复: {repeated}")
# 元组的解包
point = (3, 4)
x, y = point
print(f"x: {x}, y: {y}")
# 多重赋值
a, b, c = 1, 2, 3
print(f"a: {a}, b: {b}, c: {c}")
# 交换变量
a, b = b, a
print(f"交换后 a: {a}, b: {b}")
# 扩展解包(Python 3+)
numbers = (1, 2, 3, 4, 5)
first, *middle, last = numbers
print(f"第一个: {first}")
print(f"中间: {middle}")
print(f"最后一个: {last}")
# 忽略某些值
data = ("Alice", 25, "Engineer", "Beijing")
name, age, *_ = data
print(f"姓名: {name}, 年龄: {age}")
命名元组
from collections import namedtuple
from typing import NamedTuple
# 使用namedtuple创建
Point = namedtuple('Point', ['x', 'y'])
point = Point(3, 4)
print(f"点坐标: {point}")
print(f"x坐标: {point.x}")
print(f"y坐标: {point.y}")
# 使用字符串定义字段
Person = namedtuple('Person', 'name age city')
person = Person('Alice', 25, 'Beijing')
print(f"个人信息: {person}")
# 命名元组的方法
print(f"字段名: {person._fields}")
print(f"转为字典: {person._asdict()}")
# 替换字段值(返回新对象)
new_person = person._replace(age=26)
print(f"更新年龄后: {new_person}")
# 从可迭代对象创建
data = ['Bob', 30, 'Shanghai']
person2 = Person._make(data)
print(f"从列表创建: {person2}")
# 设置默认值
Student = namedtuple('Student', ['name', 'age', 'grade'], defaults=[0])
student1 = Student('Charlie', 20) # grade使用默认值
student2 = Student('David', 21, 85)
print(f"学生1: {student1}")
print(f"学生2: {student2}")
# 使用typing.NamedTuple(推荐,支持类型注解)
class Employee(NamedTuple):
    """Immutable employee record with typed fields."""

    name: str
    age: int
    department: str
    salary: float = 0.0  # monthly amount; defaults to 0.0 when omitted

    def get_annual_salary(self) -> float:
        """Return twelve times the ``salary`` field."""
        monthly = self.salary
        return monthly * 12

    def __str__(self) -> str:
        """Return a short summary containing name, age and department."""
        summary = f"{self.name} ({self.age}岁) - {self.department}部门"
        return summary
emp = Employee('Alice', 28, 'IT', 8000)
print(f"员工信息: {emp}")
print(f"年薪: {emp.get_annual_salary()}")
# 命名元组的应用场景
# 1. 函数返回多个值
# Defined once at module level so every result shares the same type. Creating
# the namedtuple class inside the function built a brand-new class on every
# call, so type(a) is type(b) was False for two results.
CircleInfo = namedtuple('CircleInfo', ['radius', 'area', 'circumference'])


def get_circle_info(radius):
    """Return the radius, area and circumference of a circle.

    Args:
        radius: Radius of the circle.

    Returns:
        A ``CircleInfo`` namedtuple ``(radius, area, circumference)``.
    """
    import math

    area = math.pi * radius ** 2
    circumference = 2 * math.pi * radius
    return CircleInfo(radius, area, circumference)
circle = get_circle_info(5)
print(f"圆的信息: 半径={circle.radius}, 面积={circle.area:.2f}, 周长={circle.circumference:.2f}")
# 2. 配置对象
Config = namedtuple('Config', ['host', 'port', 'debug', 'timeout'])
config = Config('localhost', 8080, True, 30)
print(f"配置: {config}")
# 3. 数据记录
LogEntry = namedtuple('LogEntry', ['timestamp', 'level', 'message'])
from datetime import datetime
log = LogEntry(datetime.now(), 'INFO', '应用启动成功')
print(f"日志: [{log.timestamp}] {log.level}: {log.message}")
# 4. 坐标和向量运算
class Vector(NamedTuple):
    """Immutable 2-D vector with element-wise addition and scalar scaling."""

    x: float
    y: float

    def __add__(self, other):
        """Return the component-wise sum with another object exposing ``x`` and ``y``."""
        return Vector(self.x + other.x, self.y + other.y)

    def __mul__(self, scalar):
        """Return this vector with both components multiplied by ``scalar``."""
        return Vector(self.x * scalar, self.y * scalar)

    def __rmul__(self, scalar):
        """Support ``scalar * vector``.

        Without this method the inherited tuple behaviour applies and
        ``2 * Vector(3, 4)`` yields the repeated tuple ``(3, 4, 3, 4)``.
        """
        return self * scalar

    def magnitude(self):
        """Return the Euclidean length of the vector."""
        return (self.x ** 2 + self.y ** 2) ** 0.5

    def __str__(self):
        """Return the vector formatted as ``Vector(x, y)``."""
        return f"Vector({self.x}, {self.y})"
v1 = Vector(3, 4)
v2 = Vector(1, 2)
v3 = v1 + v2
v4 = v1 * 2
print(f"向量1: {v1}")
print(f"向量2: {v2}")
print(f"向量相加: {v3}")
print(f"向量乘标量: {v4}")
print(f"向量1的模: {v1.magnitude()}")
5.3 字典的高级用法
字典的创建和基本操作
# 字典的创建方式
# 1. 字面量语法
student = {'name': 'Alice', 'age': 20, 'grade': 85}
print(f"学生信息: {student}")
# 2. dict()构造函数
student2 = dict(name='Bob', age=21, grade=90)
print(f"学生2: {student2}")
# 3. 从键值对列表创建
pairs = [('name', 'Charlie'), ('age', 19), ('grade', 88)]
student3 = dict(pairs)
print(f"学生3: {student3}")
# 4. 从两个列表创建
keys = ['name', 'age', 'grade']
values = ['David', 22, 92]
student4 = dict(zip(keys, values))
print(f"学生4: {student4}")
# 5. 字典推导式
squares = {x: x**2 for x in range(1, 6)}
print(f"平方数字典: {squares}")
# 6. 使用dict.fromkeys()
default_scores = dict.fromkeys(['math', 'english', 'science'], 0)
print(f"默认分数: {default_scores}")
# 字典的基本操作
student = {'name': 'Alice', 'age': 20, 'grade': 85}
# 访问元素
print(f"姓名: {student['name']}")
print(f"年龄: {student.get('age', '未知')}")
print(f"城市: {student.get('city', '未知')}")
# 修改和添加
student['age'] = 21
student['city'] = 'Beijing'
print(f"修改后: {student}")
# 删除元素
del student['grade']
popped_value = student.pop('city', '默认值')
print(f"删除后: {student}")
print(f"弹出的值: {popped_value}")
# 清空字典
temp_dict = {'a': 1, 'b': 2}
temp_dict.clear()
print(f"清空后: {temp_dict}")
# 字典的视图对象
data = {'a': 1, 'b': 2, 'c': 3}
keys_view = data.keys()
values_view = data.values()
items_view = data.items()
print(f"键视图: {list(keys_view)}")
print(f"值视图: {list(values_view)}")
print(f"项视图: {list(items_view)}")
# 视图是动态的
data['d'] = 4
print(f"添加元素后的键视图: {list(keys_view)}")
字典的高级操作
# 字典的合并
dict1 = {'a': 1, 'b': 2}
dict2 = {'c': 3, 'd': 4}
dict3 = {'b': 20, 'e': 5}
# 方法1: update()
merged1 = dict1.copy()
merged1.update(dict2)
print(f"update合并: {merged1}")
# 方法2: ** 解包(Python 3.5+)
merged2 = {**dict1, **dict2, **dict3}
print(f"解包合并: {merged2}")
# 方法3: | 操作符(Python 3.9+)
# merged3 = dict1 | dict2 | dict3
# print(f"操作符合并: {merged3}")
# 字典的setdefault方法
counter = {}
words = ['apple', 'banana', 'apple', 'cherry', 'banana', 'apple']
for word in words:
counter.setdefault(word, 0)
counter[word] += 1
print(f"词频统计: {counter}")
# 使用defaultdict
from collections import defaultdict
# 自动创建默认值
counter2 = defaultdict(int)
for word in words:
counter2[word] += 1
print(f"defaultdict词频: {dict(counter2)}")
# 嵌套字典的defaultdict
nested_dict = defaultdict(lambda: defaultdict(int))
nested_dict['fruits']['apple'] = 5
nested_dict['fruits']['banana'] = 3
nested_dict['vegetables']['carrot'] = 2
print(f"嵌套字典: {dict(nested_dict)}")
# 字典的排序
students = {
'Alice': 85,
'Bob': 92,
'Charlie': 78,
'David': 96
}
# 按键排序
sorted_by_key = dict(sorted(students.items()))
print(f"按姓名排序: {sorted_by_key}")
# 按值排序
sorted_by_value = dict(sorted(students.items(), key=lambda x: x[1], reverse=True))
print(f"按成绩排序: {sorted_by_value}")
# 获取最高分和最低分
best_student = max(students.items(), key=lambda x: x[1])
worst_student = min(students.items(), key=lambda x: x[1])
print(f"最高分: {best_student}")
print(f"最低分: {worst_student}")
# 字典的过滤
high_scores = {name: score for name, score in students.items() if score >= 90}
print(f"高分学生: {high_scores}")
# 字典的反转
reversed_dict = {v: k for k, v in students.items()}
print(f"反转字典: {reversed_dict}")
# 多级字典操作
company = {
'IT': {
'Alice': {'salary': 8000, 'level': 'senior'},
'Bob': {'salary': 6000, 'level': 'junior'}
},
'HR': {
'Charlie': {'salary': 7000, 'level': 'senior'},
'David': {'salary': 5000, 'level': 'junior'}
}
}
# 安全访问嵌套字典
def safe_get(dictionary, *keys, default=None):
    """Walk ``keys`` through nested dicts and return the value reached.

    Returns ``default`` as soon as a level is not a dict or lacks the key.
    With no keys, ``dictionary`` itself is returned.
    """
    current = dictionary
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current
alice_salary = safe_get(company, 'IT', 'Alice', 'salary')
nonexistent = safe_get(company, 'Finance', 'Eve', 'salary', default=0)
print(f"Alice的薪水: {alice_salary}")
print(f"不存在的值: {nonexistent}")
# 扁平化嵌套字典
def flatten_dict(d, parent_key='', sep='.'):
    """Collapse a nested dict into one level with joined key paths.

    Each nested key is prefixed with its ancestors' keys, joined by ``sep``.
    Top-level keys are kept as they are. A nested dict that is empty
    contributes no entries.
    """
    flat = {}
    for key, value in d.items():
        path = key if not parent_key else f"{parent_key}{sep}{key}"
        if isinstance(value, dict):
            flat.update(flatten_dict(value, path, sep=sep))
        else:
            flat[path] = value
    return flat
flat_company = flatten_dict(company)
print("扁平化公司数据:")
for key, value in flat_company.items():
print(f" {key}: {value}")
字典的特殊用法
# 使用字典实现switch语句
def handle_operation(operation, x, y):
    """Apply the arithmetic operation named by ``operation`` to ``x`` and ``y``.

    Supported names are 'add', 'subtract', 'multiply' and 'divide'. Division
    by zero returns None; an unrecognised name returns the string "未知操作".
    """
    def _add(a, b):
        return a + b

    def _subtract(a, b):
        return a - b

    def _multiply(a, b):
        return a * b

    def _divide(a, b):
        # Division by zero is reported as None instead of raising.
        if b != 0:
            return a / b
        return None

    # The dict acts as a switch statement: name -> implementation.
    dispatch = {
        'add': _add,
        'subtract': _subtract,
        'multiply': _multiply,
        'divide': _divide
    }
    try:
        chosen = dispatch[operation]
    except KeyError:
        return "未知操作"
    return chosen(x, y)
print(f"加法: {handle_operation('add', 5, 3)}")
print(f"除法: {handle_operation('divide', 10, 2)}")
print(f"未知: {handle_operation('unknown', 1, 2)}")
# 字典作为缓存
class FibonacciCache:
    """Fibonacci calculator that memoises results in a plain dict."""

    def __init__(self):
        # Index -> Fibonacci number, seeded with the two base cases.
        self.cache = {0: 0, 1: 1}

    def fibonacci(self, n):
        """Return the ``n``-th Fibonacci number (``fibonacci(0) == 0``).

        Missing entries are filled in iteratively, so large ``n`` no longer
        hits the interpreter's recursion limit.

        Args:
            n: Non-negative integer index.

        Returns:
            The ``n``-th Fibonacci number.

        Raises:
            ValueError: If ``n`` is not cached and is negative or not an
                integer. Such values previously recursed without ever
                reaching a base case.
        """
        if n in self.cache:
            return self.cache[n]
        if not isinstance(n, int) or n < 0:
            raise ValueError("n必须是非负整数")
        cache = self.cache
        # Fill every missing index up to n from the bottom up.
        for i in range(2, n + 1):
            if i not in cache:
                cache[i] = cache[i - 1] + cache[i - 2]
        return cache[n]

    def get_cache_size(self):
        """Return the number of cached entries."""
        return len(self.cache)
fib_calc = FibonacciCache()
print(f"斐波那契数列第20项: {fib_calc.fibonacci(20)}")
print(f"缓存大小: {fib_calc.get_cache_size()}")
# 使用functools.lru_cache装饰器
from functools import lru_cache
@lru_cache(maxsize=128)
def fibonacci_cached(n):
    """Return the ``n``-th Fibonacci number, memoised by ``lru_cache``.

    Any ``n`` below 2 is returned unchanged.
    """
    return n if n < 2 else fibonacci_cached(n - 1) + fibonacci_cached(n - 2)
print(f"缓存版斐波那契第30项: {fibonacci_cached(30)}")
print(f"缓存信息: {fibonacci_cached.cache_info()}")
# 字典的弱引用
import weakref
class DataStore:
    """Key/object store backed by a ``weakref.WeakValueDictionary``.

    Values are referenced weakly, so storing an object here does not keep
    it alive on its own.
    """

    def __init__(self):
        self._data = weakref.WeakValueDictionary()

    def store(self, key, obj):
        """Register ``obj`` under ``key``."""
        self._data[key] = obj

    def get(self, key):
        """Return the object stored under ``key``, or None if there is none."""
        return self._data.get(key, None)

    def keys(self):
        """Return a list of the keys currently present."""
        return list(self._data)
class TempObject:
    """Small value holder that announces its own finalisation."""

    def __init__(self, value):
        self.value = value

    def __del__(self):
        # Runs when the object is finalised; prints a notice with its value.
        notice = f"对象 {self.value} 被删除"
        print(notice)
store = DataStore()
obj1 = TempObject("test1")
obj2 = TempObject("test2")
store.store("key1", obj1)
store.store("key2", obj2)
print(f"存储的键: {store.keys()}")
# 删除强引用
del obj1
print(f"删除obj1后的键: {store.keys()}")
# 字典的内存优化(__slots__)
class RegularClass:
    """Plain class whose instances keep ``x`` and ``y`` in a per-instance ``__dict__``."""

    def __init__(self, x, y):
        self.x, self.y = x, y
class SlottedClass:
    """Class that declares ``__slots__`` so instances carry no ``__dict__``."""

    __slots__ = ['x', 'y']

    def __init__(self, x, y):
        self.x, self.y = x, y
import sys
regular = RegularClass(1, 2)
slotted = SlottedClass(1, 2)
print(f"普通类实例大小: {sys.getsizeof(regular)} + {sys.getsizeof(regular.__dict__)} = {sys.getsizeof(regular) + sys.getsizeof(regular.__dict__)}")
print(f"__slots__类实例大小: {sys.getsizeof(slotted)}")
5.4 集合的操作和应用
集合的基础操作
# 集合的创建
# 1. 使用花括号
fruits = {'apple', 'banana', 'orange'}
print(f"水果集合: {fruits}")
# 2. 使用set()构造函数
numbers = set([1, 2, 3, 4, 5])
print(f"数字集合: {numbers}")
# 3. 从字符串创建
chars = set('hello')
print(f"字符集合: {chars}")
# 4. Empty set: it must be created with set(), because a bare {} literal
#    creates an empty dict rather than an empty set.
empty_set = set()
print(f"空集合: {empty_set}")
# An f-string replacement field may not be empty (f"{}" is a SyntaxError),
# so the empty dict is bound to a name before it is printed.
empty_dict = {}
print(f"空字典: {empty_dict}")
# 集合的特性:无序、唯一
data = [1, 2, 2, 3, 3, 3, 4, 5]
unique_data = set(data)
print(f"去重后: {unique_data}")
# 集合的基本操作
fruits = {'apple', 'banana', 'orange'}
# 添加元素
fruits.add('grape')
print(f"添加grape后: {fruits}")
# 添加多个元素
fruits.update(['kiwi', 'mango'])
print(f"添加多个元素后: {fruits}")
# 删除元素
fruits.remove('banana') # 如果元素不存在会抛出KeyError
fruits.discard('pear') # 如果元素不存在不会抛出异常
popped = fruits.pop() # 随机删除一个元素
print(f"删除操作后: {fruits}")
print(f"弹出的元素: {popped}")
# 清空集合
temp_set = {'a', 'b', 'c'}
temp_set.clear()
print(f"清空后: {temp_set}")
# 成员检查
fruits = {'apple', 'banana', 'orange'}
print(f"apple在集合中: {'apple' in fruits}")
print(f"grape在集合中: {'grape' in fruits}")
# 集合长度
print(f"集合大小: {len(fruits)}")
集合运算
# 集合运算
set1 = {1, 2, 3, 4, 5}
set2 = {4, 5, 6, 7, 8}
set3 = {1, 2, 3}
# 并集(union)
union1 = set1 | set2
union2 = set1.union(set2)
print(f"并集: {union1}")
print(f"并集(方法): {union2}")
# 交集(intersection)
intersection1 = set1 & set2
intersection2 = set1.intersection(set2)
print(f"交集: {intersection1}")
print(f"交集(方法): {intersection2}")
# 差集(difference)
difference1 = set1 - set2
difference2 = set1.difference(set2)
print(f"差集(set1-set2): {difference1}")
print(f"差集(方法): {difference2}")
# 对称差集(symmetric_difference)
sym_diff1 = set1 ^ set2
sym_diff2 = set1.symmetric_difference(set2)
print(f"对称差集: {sym_diff1}")
print(f"对称差集(方法): {sym_diff2}")
# 子集和超集检查
print(f"set3是set1的子集: {set3.issubset(set1)}")
print(f"set1是set3的超集: {set1.issuperset(set3)}")
print(f"set1和set2不相交: {set1.isdisjoint(set2)}")
# 就地运算(修改原集合)
original = {1, 2, 3}
print(f"原始集合: {original}")
original |= {4, 5} # 就地并集
print(f"就地并集后: {original}")
original &= {1, 2, 3, 4} # 就地交集
print(f"就地交集后: {original}")
original -= {1} # 就地差集
print(f"就地差集后: {original}")
original ^= {2, 5, 6} # 就地对称差集
print(f"就地对称差集后: {original}")
# 多个集合运算
sets = [{1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {4, 5, 6}]
# 所有集合的并集
all_union = set().union(*sets)
print(f"所有集合的并集: {all_union}")
# 所有集合的交集
all_intersection = set.intersection(*sets)
print(f"所有集合的交集: {all_intersection}")
# 使用reduce进行复杂运算
from functools import reduce
# 逐步并集
step_union = reduce(lambda x, y: x | y, sets)
print(f"逐步并集: {step_union}")
# 逐步交集
step_intersection = reduce(lambda x, y: x & y, sets)
print(f"逐步交集: {step_intersection}")
集合的高级应用
# 数据去重和清洗
def clean_data(data_list):
    """Normalise raw values into a sorted list of unique, stripped strings.

    ``None`` and empty strings are dropped. Every remaining value is
    converted with ``str`` and stripped of surrounding whitespace; values
    that become empty after stripping are dropped as well.
    """
    normalized = {
        str(entry).strip()
        for entry in data_list
        if entry is not None and entry != ''
    }
    # Whitespace-only inputs collapse to '' after stripping.
    normalized.discard('')
    return sorted(normalized)
raw_data = [1, 2, '2', 3, None, '', ' 3 ', 4, 4, '1']
cleaned = clean_data(raw_data)
print(f"原始数据: {raw_data}")
print(f"清洗后: {cleaned}")
# 权限管理系统
class PermissionManager:
    """Track a set of permissions per user, optionally seeded from roles."""

    def __init__(self):
        # Role name -> permissions that the role grants.
        self.role_permissions = {
            'admin': {'read', 'write', 'delete', 'execute'},
            'editor': {'read', 'write'},
            'viewer': {'read'}
        }
        # User -> set of permissions the user currently holds.
        self.user_permissions = {}

    def assign_role(self, user, role):
        """Add every permission of ``role`` to ``user``; unknown roles are ignored."""
        if role not in self.role_permissions:
            return
        held = self.user_permissions.setdefault(user, set())
        held.update(self.role_permissions[role])

    def grant_permission(self, user, permission):
        """Give ``user`` a single ``permission``, creating the user entry if needed."""
        self.user_permissions.setdefault(user, set()).add(permission)

    def revoke_permission(self, user, permission):
        """Remove ``permission`` from ``user``; does nothing for unknown users."""
        if user not in self.user_permissions:
            return
        self.user_permissions[user].discard(permission)

    def has_permission(self, user, permission):
        """Return True when ``user`` holds ``permission``."""
        held = self.user_permissions.get(user, set())
        return permission in held

    def get_user_permissions(self, user):
        """Return a copy of the permissions held by ``user`` (empty set if unknown)."""
        held = self.user_permissions.get(user, set())
        return held.copy()

    def get_common_permissions(self, *users):
        """Return the permissions shared by all ``users`` (empty set when none given)."""
        if len(users) == 0:
            return set()
        first, *rest = users
        shared = self.user_permissions.get(first, set()).copy()
        for name in rest:
            shared.intersection_update(self.user_permissions.get(name, set()))
        return shared

    def get_unique_permissions(self, user, *other_users):
        """Return the permissions ``user`` holds that none of ``other_users`` hold."""
        claimed_by_others = set()
        for name in other_users:
            claimed_by_others.update(self.user_permissions.get(name, set()))
        return self.user_permissions.get(user, set()) - claimed_by_others
# 使用权限管理系统
pm = PermissionManager()
# 分配角色
pm.assign_role('alice', 'admin')
pm.assign_role('bob', 'editor')
pm.assign_role('charlie', 'viewer')
# 授予额外权限
pm.grant_permission('bob', 'execute')
pm.grant_permission('charlie', 'write')
print(f"Alice的权限: {pm.get_user_permissions('alice')}")
print(f"Bob的权限: {pm.get_user_permissions('bob')}")
print(f"Charlie的权限: {pm.get_user_permissions('charlie')}")
# 检查权限
print(f"Bob有写权限: {pm.has_permission('bob', 'write')}")
print(f"Charlie有删除权限: {pm.has_permission('charlie', 'delete')}")
# 共同权限和独有权限
common = pm.get_common_permissions('alice', 'bob', 'charlie')
unique_alice = pm.get_unique_permissions('alice', 'bob', 'charlie')
print(f"三人共同权限: {common}")
print(f"Alice独有权限: {unique_alice}")
# 标签系统
class TagSystem:
    """Two-way index between items and the tags attached to them."""

    def __init__(self):
        self.item_tags = {}  # item -> set of tags on that item
        self.tag_items = {}  # tag -> set of items carrying that tag

    def add_tags(self, item, *tags):
        """Attach ``tags`` to ``item``, updating both directions of the index."""
        own_tags = self.item_tags.setdefault(item, set())
        for tag in tags:
            own_tags.add(tag)
            self.tag_items.setdefault(tag, set()).add(item)

    def remove_tags(self, item, *tags):
        """Detach ``tags`` from ``item``; tags left with no items are deleted."""
        if item not in self.item_tags:
            return
        own_tags = self.item_tags[item]
        for tag in tags:
            own_tags.discard(tag)
            if tag not in self.tag_items:
                continue
            holders = self.tag_items[tag]
            holders.discard(item)
            if not holders:
                del self.tag_items[tag]

    def get_items_by_tags(self, *tags, mode='any'):
        """Return the items matching ``tags``.

        With ``mode='any'`` an item needs at least one of the tags; with
        ``mode='all'`` it needs every tag. No tags gives an empty set.
        Raises ValueError for any other ``mode``.
        """
        if not tags:
            return set()
        if mode not in ('any', 'all'):
            raise ValueError("mode必须是'any'或'all'")
        per_tag = [self.tag_items.get(tag, set()) for tag in tags]
        if mode == 'any':
            return set().union(*per_tag)
        # mode == 'all': start from a copy so the index itself is not mutated.
        matching = per_tag[0].copy()
        for holders in per_tag[1:]:
            matching.intersection_update(holders)
        return matching

    def get_related_items(self, item):
        """Return every other item that shares at least one tag with ``item``."""
        if item not in self.item_tags:
            return set()
        related = set()
        for tag in self.item_tags[item]:
            related.update(self.tag_items.get(tag, set()))
        related.discard(item)  # an item is not related to itself
        return related

    def get_tag_similarity(self, item1, item2):
        """Return shared tags divided by combined tags, or 0.0 if neither has tags."""
        first = self.item_tags.get(item1, set())
        second = self.item_tags.get(item2, set())
        combined = first | second
        if not combined:
            return 0.0
        return len(first & second) / len(combined)
# 使用标签系统
tag_sys = TagSystem()
# 添加物品和标签
tag_sys.add_tags('Python教程', 'programming', 'python', 'tutorial', 'beginner')
tag_sys.add_tags('Java教程', 'programming', 'java', 'tutorial', 'beginner')
tag_sys.add_tags('机器学习', 'programming', 'python', 'ai', 'advanced')
tag_sys.add_tags('数据结构', 'programming', 'algorithm', 'computer-science')
tag_sys.add_tags('Web开发', 'programming', 'web', 'javascript', 'html')
# 查询
python_items = tag_sys.get_items_by_tags('python')
print(f"Python相关物品: {python_items}")
beginner_programming = tag_sys.get_items_by_tags('programming', 'beginner', mode='all')
print(f"编程入门物品: {beginner_programming}")
# 相关物品推荐
related_to_python = tag_sys.get_related_items('Python教程')
print(f"Python教程相关物品: {related_to_python}")
# 相似度计算
similarity = tag_sys.get_tag_similarity('Python教程', 'Java教程')
print(f"Python教程和Java教程的相似度: {similarity:.2f}")
冻结集合(frozenset)
# frozenset的创建和使用
regular_set = {1, 2, 3, 4, 5}
frozen_set = frozenset([1, 2, 3, 4, 5])
print(f"普通集合: {regular_set}")
print(f"冻结集合: {frozen_set}")
# frozenset是不可变的
# frozen_set.add(6) # 这会引发AttributeError
# frozenset可以作为字典的键
set_dict = {
frozenset([1, 2]): 'group1',
frozenset([3, 4]): 'group2',
frozenset([5, 6]): 'group3'
}
print(f"以frozenset为键的字典: {set_dict}")
# frozenset可以作为集合的元素
set_of_sets = {
frozenset([1, 2, 3]),
frozenset([4, 5, 6]),
frozenset([7, 8, 9])
}
print(f"集合的集合: {set_of_sets}")
# 图的表示(使用frozenset表示边)
class Graph:
    """Undirected graph that stores each edge as a frozenset of its endpoints."""

    def __init__(self):
        self.vertices = set()
        self.edges = set()

    def add_edge(self, vertex1, vertex2):
        """Add an undirected edge and register both endpoints as vertices."""
        endpoints = (vertex1, vertex2)
        # A frozenset ignores order, so (a, b) and (b, a) are the same edge.
        self.edges.add(frozenset(endpoints))
        self.vertices.update(endpoints)

    def has_edge(self, vertex1, vertex2):
        """Return True when an edge joins the two vertices, in either order."""
        return frozenset((vertex1, vertex2)) in self.edges

    def get_neighbors(self, vertex):
        """Return the vertices that share an edge with ``vertex``."""
        adjacent = set()
        for edge in self.edges:
            if vertex not in edge:
                continue
            adjacent.update(edge - {vertex})
        return adjacent

    def __str__(self):
        """Return a summary listing the vertex set and the edge set."""
        return f"Graph(vertices={self.vertices}, edges={self.edges})"
# 使用图
graph = Graph()
graph.add_edge('A', 'B')
graph.add_edge('B', 'C')
graph.add_edge('C', 'A')
graph.add_edge('A', 'D')
print(graph)
print(f"A的邻居: {graph.get_neighbors('A')}")
print(f"A和C之间有边: {graph.has_edge('A', 'C')}")
print(f"A和E之间有边: {graph.has_edge('A', 'E')}")
5.5 字符串的深入处理
字符串的创建和基本操作
# 字符串的创建方式
# 1. 单引号和双引号
single_quote = 'Hello, World!'
double_quote = "Hello, World!"
print(f"单引号: {single_quote}")
print(f"双引号: {double_quote}")
# 2. 三引号(多行字符串)
multi_line = """
这是一个
多行字符串
可以包含换行符
"""
print(f"多行字符串: {multi_line}")
# 3. 原始字符串(r前缀)
raw_string = r"C:\Users\name\Documents\file.txt"
print(f"原始字符串: {raw_string}")
# 4. 格式化字符串(f前缀)
name = "Alice"
age = 25
f_string = f"我的名字是{name},今年{age}岁"
print(f"f字符串: {f_string}")
# 5. 字节字符串(b前缀)
byte_string = b"Hello, World!"
print(f"字节字符串: {byte_string}")
print(f"类型: {type(byte_string)}")
# 字符串的不可变性
original = "hello"
# original[0] = 'H' # 这会引发TypeError
modified = 'H' + original[1:] # 创建新字符串
print(f"原字符串: {original}")
print(f"修改后: {modified}")
# 字符串的基本操作
text = "Hello, World!"
# 长度
print(f"字符串长度: {len(text)}")
# 索引和切片
print(f"第一个字符: {text[0]}")
print(f"最后一个字符: {text[-1]}")
print(f"前5个字符: {text[:5]}")
print(f"后6个字符: {text[-6:]}")
print(f"每隔一个字符: {text[::2]}")
print(f"反转字符串: {text[::-1]}")
# 成员检查
print(f"'World'在字符串中: {'World' in text}")
print(f"'Python'在字符串中: {'Python' in text}")
# 字符串连接
first_name = "John"
last_name = "Doe"
full_name = first_name + " " + last_name
print(f"全名: {full_name}")
# 字符串重复
repeated = "Ha" * 5
print(f"重复字符串: {repeated}")
字符串方法详解
# 大小写转换
text = "Hello, World!"
print(f"小写: {text.lower()}")
print(f"大写: {text.upper()}")
print(f"首字母大写: {text.capitalize()}")
print(f"标题格式: {text.title()}")
print(f"大小写互换: {text.swapcase()}")
# 判断字符串类型
print(f"是否全为字母: {'Hello'.isalpha()}")
print(f"是否全为数字: {'12345'.isdigit()}")
print(f"是否全为字母数字: {'Hello123'.isalnum()}")
print(f"是否全为小写: {'hello'.islower()}")
print(f"是否全为大写: {'HELLO'.isupper()}")
print(f"是否为标题格式: {'Hello World'.istitle()}")
print(f"是否全为空白字符: {' '.isspace()}")
# 字符串查找和替换
text = "Python is great. Python is powerful."
# 查找
print(f"'Python'第一次出现的位置: {text.find('Python')}")
print(f"'Python'最后一次出现的位置: {text.rfind('Python')}")
print(f"'Python'出现的次数: {text.count('Python')}")
# 检查开头和结尾
print(f"以'Python'开头: {text.startswith('Python')}")
print(f"以'powerful.'结尾: {text.endswith('powerful.')}")
# 替换
replaced = text.replace('Python', 'Java')
print(f"替换后: {replaced}")
# 限制替换次数
replaced_once = text.replace('Python', 'Java', 1)
print(f"只替换一次: {replaced_once}")
# 字符串分割和连接
sentence = "apple,banana,orange,grape"
# 分割
fruits = sentence.split(',')
print(f"分割结果: {fruits}")
# 按行分割
multi_line_text = "line1\nline2\nline3"
lines = multi_line_text.splitlines()
print(f"按行分割: {lines}")
# 限制分割次数
limited_split = sentence.split(',', 2)
print(f"限制分割次数: {limited_split}")
# 从右边分割
path = "/home/user/documents/file.txt"
dir_file = path.rsplit('/', 1)
print(f"目录和文件: {dir_file}")
# 连接
fruits_list = ['apple', 'banana', 'orange']
joined = ', '.join(fruits_list)
print(f"连接结果: {joined}")
# 连接数字列表(需要转换为字符串)
numbers = [1, 2, 3, 4, 5]
number_string = '-'.join(map(str, numbers))
print(f"数字连接: {number_string}")
# 字符串清理
text_with_spaces = " Hello, World! "
print(f"原文本: '{text_with_spaces}'")
print(f"去除两端空白: '{text_with_spaces.strip()}'")
print(f"去除左端空白: '{text_with_spaces.lstrip()}'")
print(f"去除右端空白: '{text_with_spaces.rstrip()}'")
# 去除指定字符
text_with_chars = "...Hello, World!..."
print(f"去除点号: '{text_with_chars.strip('.')}'")
# 字符串填充和对齐
text = "Python"
print(f"左对齐(20): '{text.ljust(20, '-')}'")
print(f"右对齐(20): '{text.rjust(20, '-')}'")
print(f"居中对齐(20): '{text.center(20, '-')}'")
print(f"零填充(10): '{text.zfill(10)}'")
# 数字的零填充
number = "42"
print(f"数字零填充: '{number.zfill(5)}'")
# 字符串编码和解码
text = "你好,世界!"
# 编码为字节
utf8_bytes = text.encode('utf-8')
gbk_bytes = text.encode('gbk')
print(f"UTF-8编码: {utf8_bytes}")
print(f"GBK编码: {gbk_bytes}")
# 解码为字符串
decoded_utf8 = utf8_bytes.decode('utf-8')
decoded_gbk = gbk_bytes.decode('gbk')
print(f"UTF-8解码: {decoded_utf8}")
print(f"GBK解码: {decoded_gbk}")
# 处理编码错误
malformed_bytes = b'\xff\xfe\x00\x00'
try:
decoded = malformed_bytes.decode('utf-8')
except UnicodeDecodeError as e:
print(f"解码错误: {e}")
# 忽略错误
decoded = malformed_bytes.decode('utf-8', errors='ignore')
print(f"忽略错误后: '{decoded}'")
# 替换错误
decoded = malformed_bytes.decode('utf-8', errors='replace')
print(f"替换错误后: '{decoded}'")
字符串格式化
# 1. % 格式化(旧式)
name = "Alice"
age = 25
score = 95.5
old_format = "姓名: %s, 年龄: %d, 分数: %.2f" % (name, age, score)
print(f"旧式格式化: {old_format}")
# 2. str.format() 方法
format_method = "姓名: {}, 年龄: {}, 分数: {:.2f}".format(name, age, score)
print(f"format方法: {format_method}")
# 位置参数
format_positional = "姓名: {0}, 年龄: {1}, 分数: {2:.2f}".format(name, age, score)
print(f"位置参数: {format_positional}")
# 关键字参数
format_keyword = "姓名: {name}, 年龄: {age}, 分数: {score:.2f}".format(
name=name, age=age, score=score
)
print(f"关键字参数: {format_keyword}")
# 3. f-string(推荐,Python 3.6+)
f_string = f"姓名: {name}, 年龄: {age}, 分数: {score:.2f}"
print(f"f-string: {f_string}")
# f-string中的表达式
import math
radius = 5
f_expression = f"半径为{radius}的圆的面积是{math.pi * radius**2:.2f}"
print(f"f-string表达式: {f_expression}")
# f-string中的函数调用
def get_greeting(name):
    """Return an English greeting addressed to ``name``."""
    greeting = f"Hello, {name}!"
    return greeting
f_function = f"问候语: {get_greeting('World')}"
print(f"f-string函数: {f_function}")
# 格式化规范
number = 1234.5678
print(f"默认: {number}")
print(f"保留2位小数: {number:.2f}")
print(f"科学计数法: {number:.2e}")
print(f"百分比: {number:.2%}")
print(f"千分位分隔符: {number:,.2f}")
print(f"右对齐(15位): '{number:>15.2f}'")
print(f"左对齐(15位): '{number:<15.2f}'")
print(f"居中对齐(15位): '{number:^15.2f}'")
print(f"零填充(10位): '{number:010.2f}'")
# 进制转换
num = 255
print(f"十进制: {num:d}")
print(f"二进制: {num:b}")
print(f"八进制: {num:o}")
print(f"十六进制: {num:x}")
print(f"十六进制(大写): {num:X}")
# 日期时间格式化
from datetime import datetime
now = datetime.now()
print(f"当前时间: {now}")
print(f"格式化时间: {now:%Y-%m-%d %H:%M:%S}")
print(f"简短日期: {now:%Y/%m/%d}")
print(f"时间: {now:%H:%M}")
# 字符串模板
from string import Template
template = Template("Hello, $name! You have $count new messages.")
result = template.substitute(name="Alice", count=5)
print(f"模板结果: {result}")
# 安全替换(缺少变量时不报错)
partial_result = template.safe_substitute(name="Bob")
print(f"部分替换: {partial_result}")
正则表达式
import re
# 基本匹配
text = "Hello, my phone number is 123-456-7890"
pattern = r"\d{3}-\d{3}-\d{4}"
match = re.search(pattern, text)
if match:
print(f"找到电话号码: {match.group()}")
# 查找所有匹配
text = "Contact us at 123-456-7890 or 987-654-3210"
phones = re.findall(pattern, text)
print(f"所有电话号码: {phones}")
# 分组匹配
email_pattern = r"([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
email_text = "Contact: alice@example.com or bob@test.org"
email_matches = re.findall(email_pattern, email_text)
print(f"邮箱匹配: {email_matches}")
# 命名分组
named_pattern = r"(?P<user>[a-zA-Z0-9._%+-]+)@(?P<domain>[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
match = re.search(named_pattern, "alice@example.com")
if match:
print(f"用户名: {match.group('user')}")
print(f"域名: {match.group('domain')}")
print(f"完整匹配: {match.groupdict()}")
# 替换
text = "The price is $100 and the tax is $15"
# 将美元符号替换为人民币符号
replaced = re.sub(r"\$", "¥", text)
print(f"替换后: {replaced}")
# Substitution driven by a callback function
def convert_currency(match, rate=6.5):
    """Convert a matched dollar amount to a yuan string; usable as an ``re.sub`` callback.

    Args:
        match: Match object whose group 1 holds the numeric amount as text.
        rate: Multiplier applied to the amount. Defaults to 6.5, the value
            that was previously hard-coded.

    Returns:
        The converted amount formatted as ``¥<amount>`` with two decimals.

    Raises:
        ValueError: If group 1 is not parseable as a float.
    """
    amount = float(match.group(1))
    return f"¥{amount * rate:.2f}"
text = "The price is $100.50 and the tax is $15.25"
# re.sub calls convert_currency once per match and inserts its return value.
# NOTE: the pattern requires a decimal part, so an amount written as "$100" would be left untouched.
converted = re.sub(r"\$(\d+\.\d+)", convert_currency, text)
print(f"货币转换: {converted}")
# Splitting on any one of several delimiter characters
text = "apple,banana;orange:grape"
fruits = re.split(r"[,;:]", text)
print(f"正则分割: {fruits}")
# Compile the regular expression (improves performance when the pattern is reused)
compiled_pattern = re.compile(r"\b\w+@\w+\.\w+\b")
text = "Emails: alice@test.com, bob@example.org, invalid-email"
# "invalid-email" contains no '@', so only the two addresses are returned.
emails = compiled_pattern.findall(text)
print(f"编译模式匹配: {emails}")
# Commonly used regular-expression patterns, keyed by a display label
patterns = {
    "邮箱": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    # The middle group accepts 3 or 4 digits so that both 123-456-7890 and the
    # 3-4-4 grouping used in the sample text below (138-1234-5678) are matched.
    "电话": r"\d{3}-\d{3,4}-\d{4}",
    "身份证": r"\d{17}[\dXx]",
    "IP地址": r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b",
    "URL": r"https?://[\w\.-]+\.[a-zA-Z]{2,}[\w\.-]*/?[\w\.-]*",
    "中文": r"[\u4e00-\u9fff]+",
    "数字": r"\d+",
    "浮点数": r"\d+\.\d+",
}
test_text = """
联系方式:
邮箱:alice@example.com
电话:138-1234-5678
网站:https://www.example.com
IP:192.168.1.1
价格:99.99元
中文:你好世界
"""
# Run every pattern against the sample text and print only those that found something
for name, pattern in patterns.items():
    matches = re.findall(pattern, test_text)
    if matches:
        print(f"{name}: {matches}")
字符串的高级应用
# 文本处理工具类
class TextProcessor:
def __init__(self):
self.stop_words = {'的', '了', '在', '是', '我', '有', '和', '就',
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at'}
def clean_text(self, text):
"""清理文本"""
import string
# 转换为小写
text = text.lower()
# 移除标点符号
text = text.translate(str.maketrans('', '', string.punctuation))
# 移除多余空白
text = ' '.join(text.split())
return text
def extract_words(self, text):
"""提取单词"""
cleaned = self.clean_text(text)
words = cleaned.split()
# 过滤停用词
return [word for word in words if word not in self.stop_words]
def word_frequency(self, text):
"""词频统计"""
words = self.extract_words(text)
freq = {}
for word in words:
freq[word] = freq.get(word, 0) + 1
return dict(sorted(freq.items(), key=lambda x: x[1], reverse=True))
def find_keywords(self, text, top_n=5):
"""提取关键词"""
freq = self.word_frequency(text)
return list(freq.keys())[:top_n]
def similarity(self, text1, text2):
"""计算文本相似度(简单版本)"""
words1 = set(self.extract_words(text1))
words2 = set(self.extract_words(text2))
intersection = len(words1 & words2)
union = len(words1 | words2)
return intersection / union if union > 0 else 0
def summarize(self, text, max_sentences=3):
"""简单文本摘要"""
sentences = text.split('。')
sentences = [s.strip() for s in sentences if s.strip()]
if len(sentences) <= max_sentences:
return text
# 简单评分:句子长度和关键词数量
word_freq = self.word_frequency(text)
top_words = set(list(word_freq.keys())[:10])
sentence_scores = []
for sentence in sentences:
words = self.extract_words(sentence)
score = len(words) + sum(1 for word in words if word in top_words)
sentence_scores.append((sentence, score))
# 选择得分最高的句子
top_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)[:max_sentences]
return '。'.join([s[0] for s in top_sentences]) + '。'
# Using the text processor
processor = TextProcessor()
sample_text = """
Python是一种高级编程语言。它具有简洁的语法和强大的功能。
Python广泛应用于Web开发、数据科学、人工智能等领域。
学习Python可以帮助你快速开发各种应用程序。
Python的社区非常活跃,有丰富的第三方库可以使用。
"""
print(f"原文本: {sample_text}")
print(f"清理后: {processor.clean_text(sample_text)}")
print(f"词频统计: {processor.word_frequency(sample_text)}")
print(f"关键词: {processor.find_keywords(sample_text)}")
# Ask for a two-sentence summary
print(f"摘要: {processor.summarize(sample_text, 2)}")
# Text similarity between two short strings
text1 = "Python是一种编程语言"
text2 = "Python是一种高级编程语言"
similarity = processor.similarity(text1, text2)
print(f"文本相似度: {similarity:.2f}")
# String memory optimisation
import sys
# String interning
a = "hello"
b = "hello"
print(f"字符串驻留: {a is b}") # True (identity here is an interpreter detail — presumably CPython; do not rely on it)
# Long strings built at runtime are not interned automatically
long_a = "hello" * 1000
long_b = "hello" * 1000
print(f"长字符串驻留: {long_a is long_b}") # False (two equal but distinct objects; implementation-dependent)
# Manual interning: sys.intern returns one shared object for equal strings
import sys  # NOTE(review): duplicate of the import above; harmless
long_a_interned = sys.intern(long_a)
long_b_interned = sys.intern(long_b)
print(f"手动驻留: {long_a_interned is long_b_interned}") # True
# Memory usage comparison
# NOTE(review): sys.getsizeof reports the size of a single string object, and the
# interned string has the same contents, so the two numbers are expected to be
# equal. Any saving from interning comes from sharing one object, which this
# per-object measurement cannot show.
print(f"普通字符串内存: {sys.getsizeof(long_a)}")
print(f"驻留字符串内存: {sys.getsizeof(long_a_interned)}")
# 字符串构建性能比较
import time
def string_concatenation(n):
    """Concatenate the decimal strings of 0..n-1 with repeated ``+=`` (the slow variant)."""
    pieces = ""
    for digits in map(str, range(n)):
        pieces += digits
    return pieces
def string_join(n):
    """Concatenate the decimal strings of 0..n-1 with a single ``str.join`` (the fast variant)."""
    digit_strings = (str(number) for number in range(n))
    return ''.join(digit_strings)
def string_format(n):
    """Concatenate 0..n-1, rendering each number with an f-string before joining."""
    rendered = (f"{number}" for number in range(n))
    return ''.join(rendered)
n = 10000
# Time each builder once. perf_counter() is a high-resolution clock meant for
# measuring short intervals; time.time() can report 0 elapsed for runs this short.
start = time.perf_counter()
result1 = string_concatenation(n)
time1 = time.perf_counter() - start
start = time.perf_counter()
result2 = string_join(n)
time2 = time.perf_counter() - start
start = time.perf_counter()
result3 = string_format(n)
time3 = time.perf_counter() - start
print(f"字符串连接耗时: {time1:.4f}秒")
print(f"join方法耗时: {time2:.4f}秒")
print(f"格式化耗时: {time3:.4f}秒")
# Guard the ratio: a zero denominator would otherwise raise ZeroDivisionError.
speedup = time1 / time2 if time2 > 0 else float("inf")
print(f"join比连接快: {speedup:.1f}倍")
5.6 数据结构的选择和性能比较
import time
import sys
from collections import deque, defaultdict, Counter
import random
# Timing helper
def performance_test(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` once and measure how long it takes.

    This is a plain helper, not a decorator: it runs the call immediately.

    Args:
        func: Callable to time.
        *args: Positional arguments forwarded to *func*.
        **kwargs: Keyword arguments forwarded to *func*.

    Returns:
        A ``(result, elapsed_seconds)`` tuple.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock whose coarse resolution can report 0 elapsed for fast calls.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start_time
    return result, elapsed
# List vs. tuple performance comparison
def list_vs_tuple_performance():
    """Print creation time, strided-access time and shallow size for a list and a tuple."""
    print("=== 列表 vs 元组性能比较 ===")
    n = 1000000

    # Creation cost
    def build_list():
        return [i for i in range(n)]

    def build_tuple():
        return tuple(i for i in range(n))

    list_create_time = performance_test(build_list)[1]
    tuple_create_time = performance_test(build_tuple)[1]
    print(f"创建{n}个元素:")
    print(f" 列表: {list_create_time:.4f}秒")
    print(f" 元组: {tuple_create_time:.4f}秒")

    # Access cost: read every 1000th element by index
    test_list = list(range(n))
    test_tuple = tuple(range(n))

    def access_test(container):
        running_sum = 0
        index = 0
        while index < n:
            running_sum += container[index]
            index += 1000
        return running_sum

    list_access_time = performance_test(access_test, test_list)[1]
    tuple_access_time = performance_test(access_test, test_tuple)[1]
    print("随机访问:")
    print(f" 列表: {list_access_time:.4f}秒")
    print(f" 元组: {tuple_access_time:.4f}秒")

    # Memory footprint (sys.getsizeof is shallow: the container only, not its items)
    print("内存使用:")
    print(f" 列表: {sys.getsizeof(test_list)} 字节")
    print(f" 元组: {sys.getsizeof(test_tuple)} 字节")
# List vs. deque performance comparison
def list_vs_deque_performance():
    """Print how long a list and a deque take to insert n items at the head and at the tail."""
    print("\n=== 列表 vs 双端队列性能比较 ===")
    n = 100000

    # Head insertion
    def list_prepend():
        lst = []
        for i in range(n):
            lst.insert(0, i)
        return lst

    def deque_prepend():
        dq = deque()
        for i in range(n):
            dq.appendleft(i)
        return dq

    _, list_prepend_time = performance_test(list_prepend)
    _, deque_prepend_time = performance_test(deque_prepend)
    # A coarse timer can report 0 elapsed for the deque run; avoid ZeroDivisionError.
    if deque_prepend_time > 0:
        speedup = list_prepend_time / deque_prepend_time
    else:
        speedup = float('inf')
    print(f"头部插入{n}个元素:")
    print(f" 列表: {list_prepend_time:.4f}秒")
    print(f" 双端队列: {deque_prepend_time:.4f}秒")
    print(f" 性能提升: {speedup:.1f}倍")

    # Tail insertion
    def list_append():
        lst = []
        for i in range(n):
            lst.append(i)
        return lst

    def deque_append():
        dq = deque()
        for i in range(n):
            dq.append(i)
        return dq

    _, list_append_time = performance_test(list_append)
    _, deque_append_time = performance_test(deque_append)
    print(f"尾部插入{n}个元素:")
    print(f" 列表: {list_append_time:.4f}秒")
    print(f" 双端队列: {deque_append_time:.4f}秒")
# Dict vs. list lookup performance
def dict_vs_list_lookup():
    """Print how long 1000 membership tests take against a list and against a dict."""
    print("\n=== 字典 vs 列表查找性能 ===")
    n = 100000

    # Test data: the same n integers in both containers, plus 1000 distinct probes
    data_list = list(range(n))
    data_dict = {i: i for i in range(n)}
    search_items = random.sample(range(n), 1000)

    # List lookup: each `in` is a linear scan
    def list_lookup():
        return sum(1 for item in search_items if item in data_list)

    # Dict lookup: each `in` is a hash lookup
    def dict_lookup():
        return sum(1 for item in search_items if item in data_dict)

    _, list_lookup_time = performance_test(list_lookup)
    _, dict_lookup_time = performance_test(dict_lookup)
    # 1000 dict lookups are fast enough that a coarse timer may report 0 elapsed;
    # guard the ratio instead of raising ZeroDivisionError.
    if dict_lookup_time > 0:
        speedup = list_lookup_time / dict_lookup_time
    else:
        speedup = float('inf')
    print("查找1000个元素:")
    print(f" 列表: {list_lookup_time:.4f}秒")
    print(f" 字典: {dict_lookup_time:.4f}秒")
    print(f" 性能提升: {speedup:.1f}倍")
# Set vs. list de-duplication performance
def set_vs_list_dedup():
    """Print how long three de-duplication strategies take on the same random data."""
    print("\n=== 集合 vs 列表去重性能 ===")
    n = 100000

    # Data with many duplicates: n draws from roughly n/10 distinct values
    data_with_duplicates = [random.randint(0, n // 10) for _ in range(n)]

    # List-based: linear membership test per item (order preserved)
    def list_dedup():
        unique = []
        for item in data_with_duplicates:
            if item not in unique:
                unique.append(item)
        return unique

    # Set-based: order of the result is not guaranteed
    def set_dedup():
        return list(set(data_with_duplicates))

    # Dict-based: keeps first-occurrence order
    def dict_dedup():
        return list(dict.fromkeys(data_with_duplicates))

    _, list_dedup_time = performance_test(list_dedup)
    _, set_dedup_time = performance_test(set_dedup)
    _, dict_dedup_time = performance_test(dict_dedup)
    # Guard the ratio: a coarse timer may report 0 elapsed for the set run.
    if set_dedup_time > 0:
        speedup = list_dedup_time / set_dedup_time
    else:
        speedup = float('inf')
    print(f"去重{n}个元素:")
    print(f" 列表方法: {list_dedup_time:.4f}秒")
    print(f" 集合方法: {set_dedup_time:.4f}秒")
    print(f" 字典方法: {dict_dedup_time:.4f}秒")
    print(f" 集合比列表快: {speedup:.1f}倍")
# Memory usage of the different data structures
def memory_usage_comparison():
    """Print the shallow ``sys.getsizeof`` of five containers holding the same 10000 integers."""
    print("\n=== 内存使用比较 ===")
    n = 10000

    # One container of each kind, all filled from range(n)
    containers = (
        ("列表", list(range(n))),
        ("元组", tuple(range(n))),
        ("集合", set(range(n))),
        ("字典", {i: i for i in range(n)}),
        ("双端队列", deque(range(n))),
    )

    print(f"{n}个整数的内存使用:")
    for label, container in containers:
        # getsizeof is shallow: it measures the container, not the integers inside it
        print(f" {label}: {sys.getsizeof(container):,} 字节")
# Data-structure selection guide
def data_structure_guide():
    """Print, for each built-in container, when to use it, when not to, and its time complexity."""
    print("\n=== 数据结构选择指南 ===")

    # (name, good fits, poor fits, time complexity)
    entries = [
        (
            "列表 (list)",
            ["需要有序存储", "需要索引访问", "需要修改元素", "需要在尾部频繁添加/删除"],
            ["需要在头部频繁添加/删除", "需要快速查找", "不需要修改的数据"],
            "访问O(1), 搜索O(n), 插入O(n), 删除O(n)",
        ),
        (
            "元组 (tuple)",
            ["不可变数据", "作为字典键", "函数返回多个值", "配置数据"],
            ["需要修改数据", "需要频繁添加/删除"],
            "访问O(1), 搜索O(n)",
        ),
        (
            "字典 (dict)",
            ["键值对存储", "快速查找", "计数统计", "缓存数据"],
            ["需要有序存储(Python 3.7前)", "键不可哈希"],
            "访问O(1), 搜索O(1), 插入O(1), 删除O(1)",
        ),
        (
            "集合 (set)",
            ["去重", "成员检查", "集合运算", "唯一性约束"],
            ["需要有序存储", "需要索引访问", "元素不可哈希"],
            "搜索O(1), 插入O(1), 删除O(1)",
        ),
        (
            "双端队列 (deque)",
            ["队列操作", "栈操作", "滑动窗口", "头尾频繁操作"],
            ["需要随机访问", "需要索引操作"],
            "两端操作O(1), 中间操作O(n)",
        ),
    ]

    for structure, good_fits, poor_fits, complexity in entries:
        print(f"\n{structure}:")
        print(" 适用场景: " + ', '.join(good_fits))
        print(" 不适用场景: " + ', '.join(poor_fits))
        print(" 时间复杂度: " + complexity)
# Run every benchmark, then print the selection guide
if __name__ == "__main__":
    for report in (
        list_vs_tuple_performance,
        list_vs_deque_performance,
        dict_vs_list_lookup,
        set_vs_list_dedup,
        memory_usage_comparison,
        data_structure_guide,
    ):
        report()
5.7 综合示例:数据分析工具
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
数据分析工具
功能:
1. 数据加载和清洗
2. 统计分析
3. 数据可视化(文本形式)
4. 报告生成
作者: Python学习者
版本: 1.0
"""
import json
import csv
import statistics
from collections import defaultdict, Counter, namedtuple
from typing import List, Dict, Any, Optional, Union
import re
from datetime import datetime
# Record type for one data point
DataRecord = namedtuple('DataRecord', ['id', 'timestamp', 'category', 'value', 'metadata'])


class DataAnalyzer:
    """In-memory analyzer for a list of ``DataRecord`` rows.

    Attributes are plain and public:
        data: the loaded records, in load order.
        categories: the set of category strings seen in ``data``.
        date_range: ``(min_timestamp, max_timestamp)`` as strings, or None.
    """

    def __init__(self):
        self.data: List[DataRecord] = []
        self.categories = set()
        self.date_range = None

    def _add_records(self, records: List[DataRecord]) -> None:
        """Append *records* and refresh the derived category set and date range."""
        self.data.extend(records)
        self.categories.update(record.category for record in records)
        self._update_date_range()

    def load_from_csv(self, filename: str) -> bool:
        """Load records from a CSV file with a header row.

        Recognised columns: id, timestamp, category, value, metadata. The load
        is all-or-nothing: if the file cannot be read or any row has a
        non-numeric value, nothing is added.

        Returns:
            True on success, False (after printing the error) on failure.
        """
        try:
            # newline='' lets the csv module handle embedded newlines itself.
            with open(filename, 'r', encoding='utf-8', newline='') as file:
                records = [
                    DataRecord(
                        id=row.get('id', ''),
                        timestamp=row.get('timestamp', ''),
                        category=row.get('category', ''),
                        value=float(row.get('value', 0)),
                        metadata=row.get('metadata', '{}')
                    )
                    for row in csv.DictReader(file)
                ]
        except (OSError, ValueError, TypeError, csv.Error) as e:
            print(f"加载CSV文件失败: {e}")
            return False
        self._add_records(records)
        print(f"成功加载 {len(self.data)} 条记录")
        return True

    def load_from_json(self, filename: str) -> bool:
        """Load records from a JSON file containing a list of objects.

        Each object's ``metadata`` is re-serialised to a JSON string. The load
        is all-or-nothing, like ``load_from_csv``.

        Returns:
            True on success, False (after printing the error) on failure.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
            records = [
                DataRecord(
                    id=item.get('id', ''),
                    timestamp=item.get('timestamp', ''),
                    category=item.get('category', ''),
                    value=float(item.get('value', 0)),
                    metadata=json.dumps(item.get('metadata', {}))
                )
                for item in json_data
            ]
        # AttributeError/TypeError cover a top-level value that is not a list of objects.
        except (OSError, ValueError, TypeError, AttributeError) as e:
            print(f"加载JSON文件失败: {e}")
            return False
        self._add_records(records)
        print(f"成功加载 {len(self.data)} 条记录")
        return True

    def add_sample_data(self, count: int = 1000) -> None:
        """Append *count* randomly generated records (default 1000).

        Timestamps fall on one of the last 31 days; values are uniform in
        [10, 1000].
        """
        import random
        from datetime import datetime, timedelta

        categories = ['销售', '市场', '技术', '客服', '财务']
        base_date = datetime.now() - timedelta(days=30)
        records = []
        for i in range(count):
            timestamp = base_date + timedelta(days=random.randint(0, 30))
            records.append(DataRecord(
                id=f"REC{i:04d}",
                timestamp=timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                category=random.choice(categories),
                value=random.uniform(10, 1000),
                metadata=json.dumps({
                    'source': random.choice(['web', 'mobile', 'api']),
                    'region': random.choice(['北京', '上海', '广州', '深圳'])
                })
            ))
        self._add_records(records)
        print(f"添加了 {count} 条示例数据")

    def _update_date_range(self) -> None:
        """Recompute ``date_range`` from the non-empty timestamps in ``data``.

        Timestamps are compared as strings; nothing changes when there is no
        data or no timestamp.
        """
        if not self.data:
            return
        timestamps = [record.timestamp for record in self.data if record.timestamp]
        if timestamps:
            self.date_range = (min(timestamps), max(timestamps))

    def clean_data(self) -> Dict[str, int]:
        """Drop incomplete, out-of-range and duplicate records in place.

        A record is dropped when its id or category is empty, when its value
        is outside [0, 10000], or when its id was already seen (first wins).

        Returns:
            Counters keyed by the Chinese labels used in the report output.
        """
        stats = {
            '原始记录': len(self.data),
            '删除空值': 0,
            '删除异常值': 0,
            '删除重复': 0
        }
        seen_ids = set()
        kept = []
        # Pass 1: required fields and value range
        for record in self.data:
            if not record.id or not record.category:
                stats['删除空值'] += 1
            elif record.value < 0 or record.value > 10000:
                stats['删除异常值'] += 1
            else:
                kept.append(record)
        # Pass 2: de-duplicate by id among the surviving records
        final_data = []
        for record in kept:
            if record.id in seen_ids:
                stats['删除重复'] += 1
                continue
            seen_ids.add(record.id)
            final_data.append(record)

        self.data = final_data
        stats['清洗后记录'] = len(self.data)
        # Derived state must reflect the cleaned data
        self.categories = {record.category for record in self.data}
        self._update_date_range()
        return stats

    def get_basic_stats(self) -> Dict[str, Any]:
        """Return overall statistics, or an empty dict when there is no data."""
        if not self.data:
            return {}
        values = [record.value for record in self.data]
        return {
            '记录总数': len(self.data),
            '类别数量': len(self.categories),
            '数值统计': {
                '总和': sum(values),
                '平均值': statistics.mean(values),
                '中位数': statistics.median(values),
                '最大值': max(values),
                '最小值': min(values),
                # stdev needs at least two samples
                '标准差': statistics.stdev(values) if len(values) > 1 else 0
            },
            '日期范围': self.date_range
        }

    def get_category_stats(self) -> Dict[str, Dict[str, float]]:
        """Return count, sum, mean, max and min of the values per category."""
        category_data = defaultdict(list)
        for record in self.data:
            category_data[record.category].append(record.value)
        return {
            category: {
                '数量': len(values),
                '总和': sum(values),
                '平均值': statistics.mean(values),
                '最大值': max(values),
                '最小值': min(values)
            }
            for category, values in category_data.items()
        }

    def get_time_series(self, group_by='day') -> Dict[str, Dict[str, float]]:
        """Aggregate values per time bucket.

        Args:
            group_by: 'day', 'month' or 'hour'; any other value groups by the
                raw timestamp string.

        Returns:
            ``{bucket: {'数量', '总和', '平均值'}}`` sorted by bucket key. Records
            whose timestamp is empty or not ``%Y-%m-%d %H:%M:%S`` are skipped.
        """
        bucket_formats = {'day': '%Y-%m-%d', 'month': '%Y-%m', 'hour': '%Y-%m-%d %H:00'}
        time_data = defaultdict(list)
        for record in self.data:
            if not record.timestamp:
                continue
            try:
                dt = datetime.strptime(record.timestamp, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                continue
            if group_by in bucket_formats:
                key = dt.strftime(bucket_formats[group_by])
            else:
                key = record.timestamp
            time_data[key].append(record.value)

        result = {
            time_key: {
                '数量': len(values),
                '总和': sum(values),
                '平均值': statistics.mean(values)
            }
            for time_key, values in time_data.items()
        }
        return dict(sorted(result.items()))

    def find_outliers(self, method='iqr') -> List[DataRecord]:
        """Return the records whose value is an outlier.

        Args:
            method: 'iqr' (outside 1.5 * IQR from the quartiles) or 'zscore'
                (more than 3 standard deviations from the mean). Any other
                value yields an empty list.

        Returns:
            The outlier records in data order; empty when there are fewer than
            two records, since neither method is defined then.
        """
        values = [record.value for record in self.data]
        # quantiles()/stdev() need at least two data points; without this
        # guard an empty analyzer raised StatisticsError.
        if len(values) < 2:
            return []

        if method == 'iqr':
            q1, _, q3 = statistics.quantiles(values, n=4)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            return [record for record in self.data
                    if record.value < lower_bound or record.value > upper_bound]

        if method == 'zscore':
            mean_val = statistics.mean(values)
            std_val = statistics.stdev(values)
            if std_val == 0:
                return []
            return [record for record in self.data
                    if abs(record.value - mean_val) / std_val > 3]

        return []

    def get_top_records(self, n=10, by='value') -> List[DataRecord]:
        """Return the first *n* records ordered descending by 'value' or 'timestamp'.

        Any other *by* returns the first *n* records in load order.
        """
        if by == 'value':
            return sorted(self.data, key=lambda x: x.value, reverse=True)[:n]
        elif by == 'timestamp':
            return sorted(self.data, key=lambda x: x.timestamp, reverse=True)[:n]
        else:
            return self.data[:n]

    def search_records(self, **criteria) -> List[DataRecord]:
        """Return the records matching every given criterion.

        Supported keys: category (exact), min_value / max_value (inclusive),
        date_from / date_to (string comparison against the timestamp) and
        id_pattern (case-insensitive regex search). Unknown keys are ignored.
        """
        results = self.data
        for key, value in criteria.items():
            if key == 'category':
                results = [r for r in results if r.category == value]
            elif key == 'min_value':
                results = [r for r in results if r.value >= value]
            elif key == 'max_value':
                results = [r for r in results if r.value <= value]
            elif key == 'date_from':
                results = [r for r in results if r.timestamp >= value]
            elif key == 'date_to':
                results = [r for r in results if r.timestamp <= value]
            elif key == 'id_pattern':
                pattern = re.compile(value, re.IGNORECASE)
                results = [r for r in results if pattern.search(r.id)]
        return results

    def generate_report(self) -> str:
        """Build the multi-section text report and return it as one string."""
        report = []
        report.append("=" * 60)
        report.append(" 数据分析报告")
        report.append("=" * 60)
        report.append(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report.append("")

        # Overall statistics
        basic_stats = self.get_basic_stats()
        if basic_stats:
            report.append("基本统计信息:")
            report.append(f" 记录总数: {basic_stats['记录总数']:,}")
            report.append(f" 类别数量: {basic_stats['类别数量']}")
            num_stats = basic_stats['数值统计']
            report.append(" 数值统计:")
            report.append(f" 总和: {num_stats['总和']:,.2f}")
            report.append(f" 平均值: {num_stats['平均值']:.2f}")
            report.append(f" 中位数: {num_stats['中位数']:.2f}")
            report.append(f" 最大值: {num_stats['最大值']:.2f}")
            report.append(f" 最小值: {num_stats['最小值']:.2f}")
            report.append(f" 标准差: {num_stats['标准差']:.2f}")
            if basic_stats['日期范围']:
                report.append(f" 日期范围: {basic_stats['日期范围'][0]} 到 {basic_stats['日期范围'][1]}")

        # Per-category statistics, largest total first
        category_stats = self.get_category_stats()
        if category_stats:
            report.append("\n分类统计:")
            for category, stats in sorted(category_stats.items(),
                                          key=lambda x: x[1]['总和'], reverse=True):
                report.append(f" {category}:")
                report.append(f" 数量: {stats['数量']:,}")
                report.append(f" 总和: {stats['总和']:,.2f}")
                report.append(f" 平均值: {stats['平均值']:.2f}")

        # Outlier detection (IQR); individual outliers are listed only when there are few
        outliers = self.find_outliers()
        if outliers:
            report.append("\n异常值检测:")
            report.append(f" 发现 {len(outliers)} 个异常值")
            report.append(f" 异常值占比: {len(outliers)/len(self.data)*100:.1f}%")
            if len(outliers) <= 10:
                report.append(" 异常值列表:")
                for outlier in outliers:
                    report.append(f" {outlier.id}: {outlier.value:.2f} ({outlier.category})")

        # Top 10 by value
        top_records = self.get_top_records(10)
        if top_records:
            report.append("\n数值排名前10:")
            for i, record in enumerate(top_records, 1):
                report.append(f" {i:2d}. {record.id}: {record.value:.2f} ({record.category})")

        report.append("\n" + "=" * 60)
        return "\n".join(report)

    def export_summary(self, filename: str) -> bool:
        """Write a JSON summary (stats, outlier count, top 10 records) to *filename*.

        Returns:
            True on success, False (after printing the error) on failure.
        """
        try:
            summary = {
                'basic_stats': self.get_basic_stats(),
                'category_stats': self.get_category_stats(),
                'outliers_count': len(self.find_outliers()),
                'top_records': [record._asdict() for record in self.get_top_records(10)],
                'export_time': datetime.now().isoformat()
            }
            with open(filename, 'w', encoding='utf-8') as file:
                json.dump(summary, file, ensure_ascii=False, indent=2)
        except (OSError, TypeError, ValueError) as e:
            print(f"导出失败: {e}")
            return False
        print(f"摘要已导出到: {filename}")
        return True
def main():
    """Run the interactive text menu of the data analysis tool.

    Loops until the user chooses "12", presses Ctrl-C, or stdin reaches
    end-of-file. Invalid numeric input and other errors are reported and the
    menu is shown again.
    """
    menu_lines = (
        "1. 加载示例数据",
        "2. 从CSV加载数据",
        "3. 从JSON加载数据",
        "4. 清洗数据",
        "5. 基本统计",
        "6. 分类统计",
        "7. 时间序列分析",
        "8. 异常值检测",
        "9. 搜索记录",
        "10. 生成报告",
        "11. 导出摘要",
        "12. 退出",
    )
    analyzer = DataAnalyzer()
    while True:
        print("\n" + "=" * 40)
        print(" 数据分析工具")
        print("=" * 40)
        for line in menu_lines:
            print(line)
        try:
            choice = input("\n请选择操作 (1-12): ").strip()
            if choice == '1':
                analyzer.add_sample_data()
            elif choice == '2':
                filename = input("请输入CSV文件路径: ").strip()
                analyzer.load_from_csv(filename)
            elif choice == '3':
                filename = input("请输入JSON文件路径: ").strip()
                analyzer.load_from_json(filename)
            elif choice == '4':
                stats = analyzer.clean_data()
                print("数据清洗完成:")
                for key, value in stats.items():
                    print(f" {key}: {value:,}")
            elif choice == '5':
                stats = analyzer.get_basic_stats()
                if stats:
                    print("\n基本统计信息:")
                    print(f"记录总数: {stats['记录总数']:,}")
                    print(f"类别数量: {stats['类别数量']}")
                    num_stats = stats['数值统计']
                    print(f"平均值: {num_stats['平均值']:.2f}")
                    print(f"中位数: {num_stats['中位数']:.2f}")
                    print(f"标准差: {num_stats['标准差']:.2f}")
                else:
                    print("没有数据")
            elif choice == '6':
                stats = analyzer.get_category_stats()
                if stats:
                    print("\n分类统计:")
                    for category, data in stats.items():
                        print(f"{category}: {data['数量']}条记录, 平均值: {data['平均值']:.2f}")
                else:
                    print("没有数据")
            elif choice == '7':
                group_by = input("分组方式 (day/month/hour): ").strip() or 'day'
                time_series = analyzer.get_time_series(group_by)
                if time_series:
                    print(f"\n时间序列分析 (按{group_by}分组):")
                    # Show only the first 10 buckets
                    for time_key, data in list(time_series.items())[:10]:
                        print(f"{time_key}: {data['数量']}条记录, 平均值: {data['平均值']:.2f}")
                    if len(time_series) > 10:
                        print(f"... 还有 {len(time_series) - 10} 个时间点")
                else:
                    print("没有时间数据")
            elif choice == '8':
                method = input("检测方法 (iqr/zscore): ").strip() or 'iqr'
                outliers = analyzer.find_outliers(method)
                print(f"\n发现 {len(outliers)} 个异常值:")
                # Show only the first 10 outliers
                for outlier in outliers[:10]:
                    print(f"{outlier.id}: {outlier.value:.2f} ({outlier.category})")
                if len(outliers) > 10:
                    print(f"... 还有 {len(outliers) - 10} 个异常值")
            elif choice == '9':
                print("搜索条件 (留空跳过):")
                criteria = {}
                category = input("类别: ").strip()
                if category:
                    criteria['category'] = category
                min_value = input("最小值: ").strip()
                if min_value:
                    # float() raising ValueError is reported by the handler below
                    criteria['min_value'] = float(min_value)
                max_value = input("最大值: ").strip()
                if max_value:
                    criteria['max_value'] = float(max_value)
                results = analyzer.search_records(**criteria)
                print(f"\n找到 {len(results)} 条记录:")
                # Show only the first 10 hits
                for record in results[:10]:
                    print(f"{record.id}: {record.value:.2f} ({record.category})")
                if len(results) > 10:
                    print(f"... 还有 {len(results) - 10} 条记录")
            elif choice == '10':
                report = analyzer.generate_report()
                print(report)
                save = input("\n是否保存报告到文件? (y/n): ").lower().strip()
                if save in ['y', 'yes', '是']:
                    filename = f"data_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write(report)
                    print(f"报告已保存到: {filename}")
            elif choice == '11':
                filename = f"data_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
                analyzer.export_summary(filename)
            elif choice == '12':
                print("感谢使用数据分析工具!")
                break
            else:
                print("无效的选择,请重新输入")
        except ValueError as e:
            print(f"输入错误: {e}")
        except (KeyboardInterrupt, EOFError):
            # EOFError means stdin was closed (piped input exhausted, Ctrl-D).
            # Every later input() would fail the same way, so the generic
            # handler below would loop forever; leave the loop instead.
            print("\n\n程序被用户中断")
            break
        except Exception as e:
            # Top-level boundary of the interactive loop: report and keep the menu alive.
            print(f"发生错误: {e}")


if __name__ == "__main__":
    main()
运行数据分析工具:
python data_analyzer.py
本章小结
本章我们深入学习了Python的数据结构:
- 列表的高级操作:切片、方法、性能优化、高级应用
- 元组和命名元组:不可变性、解包、命名元组的使用
- 字典的高级用法:创建、操作、排序、嵌套字典处理
- 集合的操作和应用:基本操作、集合运算、实际应用场景
- 字符串的深入处理:方法详解、格式化、正则表达式、高级应用
- 性能比较和选择指南:不同数据结构的性能特点和适用场景
- 综合应用:通过数据分析工具整合所学知识
下一章预告
下一章我们将学习《函数和模块》,内容包括:
- 函数的定义和调用
- 参数传递和作用域
- 装饰器和闭包
- 模块和包的使用
- 标准库介绍
练习题
基础练习
列表操作:
- 实现列表的快速排序算法
- 编写函数合并两个有序列表
- 实现列表的二分查找
字典应用:
- 实现一个简单的缓存系统
- 编写词频统计程序
- 创建多级字典的安全访问函数
进阶练习
集合运算:
- 实现集合的幂集生成
- 编写图的邻接表表示
- 实现简单的推荐系统
字符串处理:
- 实现简单的模板引擎
- 编写文本相似度计算函数
- 创建日志解析器
综合练习
项目实战:
- 扩展数据分析工具,添加更多统计功能
- 实现简单的数据库查询引擎
- 创建文本处理和分析工具
提示:数据结构是程序的基础,选择合适的数据结构能显著提高程序性能。多练习不同场景下的应用,培养数据结构选择的直觉。