本章目标
通过本章学习,您将能够:
- 创建和配置Scrapy项目
- 理解Scrapy项目的目录结构
- 编写第一个Spider
- 运行和调试Spider
- 处理基本的数据提取
1. 创建Scrapy项目
1.1 使用命令行创建项目
# 创建新的Scrapy项目
scrapy startproject tutorial
# 进入项目目录
cd tutorial
1.2 项目目录结构
tutorial/
    scrapy.cfg            # 部署配置文件
    tutorial/             # 项目的Python模块
        __init__.py
        items.py          # 项目中的item文件
        middlewares.py    # 项目中的middlewares文件
        pipelines.py      # 项目中的pipelines文件
        settings.py       # 项目的设置文件
        spiders/          # 放置spider代码的目录
            __init__.py
1.3 各文件详解
# 1. scrapy.cfg - 项目配置文件
print("📁 scrapy.cfg 配置文件:")
print("""
[settings]
default = tutorial.settings
[deploy]
#url = http://localhost:6800/
project = tutorial
""")
# 2. settings.py - 项目设置
print("\n⚙️ settings.py 主要配置:")
print("""
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
# 遵守robots.txt规则
ROBOTSTXT_OBEY = True
# 下载延迟设置
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True  # 实际延迟在0.5~1.5倍DOWNLOAD_DELAY之间随机
# 用户代理设置
USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
# 并发请求数
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
# 缓存设置
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
""")
2. 编写第一个Spider
2.1 创建Spider
# 在项目根目录(包含scrapy.cfg的目录)下生成Spider
cd tutorial
scrapy genspider quotes quotes.toscrape.com
2.2 Spider基本结构
# tutorial/spiders/quotes.py
import scrapy
class QuotesSpider(scrapy.Spider):
name = 'quotes' # Spider的唯一标识符
allowed_domains = ['quotes.toscrape.com'] # 允许爬取的域名
start_urls = ['http://quotes.toscrape.com/'] # 起始URL列表
def parse(self, response):
"""
默认的回调函数,处理start_urls返回的响应
"""
# 提取页面中的引用
quotes = response.css('div.quote')
for quote in quotes:
yield {
'text': quote.css('span.text::text').get(),
'author': quote.css('small.author::text').get(),
'tags': quote.css('div.tags a.tag::text').getall(),
}
# 跟踪下一页链接
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
next_page = response.urljoin(next_page)
yield scrapy.Request(next_page, callback=self.parse)
print("🕷️ 第一个Spider创建完成!")
2.3 Spider详细解析
# 3. Spider组件详解
print("\n🔍 Spider组件详解:")
class DetailedQuotesSpider(scrapy.Spider):
name = 'detailed_quotes'
allowed_domains = ['quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com/']
# 自定义设置
custom_settings = {
'DOWNLOAD_DELAY': 1,
'CONCURRENT_REQUESTS': 1,
}
def start_requests(self):
"""
生成初始请求的方法
可以自定义请求头、cookies等
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
for url in self.start_urls:
yield scrapy.Request(
url=url,
headers=headers,
callback=self.parse,
meta={'page_number': 1}
)
def parse(self, response):
"""
解析响应的主要方法
"""
page_number = response.meta.get('page_number', 1)
self.logger.info(f'正在解析第 {page_number} 页')
# 提取引用数据
quotes = response.css('div.quote')
for quote in quotes:
# 提取基本信息
quote_data = {
'text': quote.css('span.text::text').get(),
'author': quote.css('small.author::text').get(),
'tags': quote.css('div.tags a.tag::text').getall(),
'page': page_number,
}
# 获取作者详情页链接
author_url = quote.css('small.author ~ a::attr(href)').get()
if author_url:
author_url = response.urljoin(author_url)
# 传递quote_data到作者页面
yield scrapy.Request(
url=author_url,
callback=self.parse_author,
meta={'quote_data': quote_data}
)
else:
yield quote_data
# 处理分页
next_page = response.css('li.next a::attr(href)').get()
if next_page:
next_page = response.urljoin(next_page)
yield scrapy.Request(
url=next_page,
callback=self.parse,
meta={'page_number': page_number + 1}
)
def parse_author(self, response):
"""
解析作者详情页
"""
quote_data = response.meta['quote_data']
# 提取作者详细信息
author_info = {
'name': response.css('h3.author-title::text').get(),
'birth_date': response.css('span.author-born-date::text').get(),
'birth_location': response.css('span.author-born-location::text').get(),
'description': response.css('div.author-description::text').get(),
}
# 合并数据
quote_data['author_info'] = author_info
yield quote_data
print("Spider组件详解完成!")
3. 运行Spider
3.1 基本运行命令
# 4. 运行Spider的方法
print("\n🚀 运行Spider:")
# 命令行运行方式
commands = [
"# 基本运行",
"scrapy crawl quotes",
"",
"# 保存到文件",
"scrapy crawl quotes -o quotes.json",
"scrapy crawl quotes -o quotes.csv",
"scrapy crawl quotes -o quotes.xml",
"",
"# 设置日志级别",
"scrapy crawl quotes -L INFO",
"scrapy crawl quotes -L DEBUG",
"",
"# 自定义设置",
"scrapy crawl quotes -s DOWNLOAD_DELAY=5",
"scrapy crawl quotes -s CONCURRENT_REQUESTS=1",
"",
"# 传递参数给Spider",
"scrapy crawl quotes -a category=inspirational",
"scrapy crawl quotes -a start_page=2",
]
for cmd in commands:
print(cmd)
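通过-a传入的参数默认会作为同名属性绑定到Spider实例上,也可以在__init__中显式接收并做类型转换。下面是一个简要示意(quotes_by_tag、category、start_page均为示例名称):
# 补充示例:Spider接收-a传入的参数(简要示意)
import scrapy

class QuotesByTagSpider(scrapy.Spider):
    name = 'quotes_by_tag'

    def __init__(self, category=None, start_page=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.category = category
        self.start_page = int(start_page)  # 命令行传入的参数都是字符串,需自行转换类型

    def start_requests(self):
        if self.category:
            url = f'http://quotes.toscrape.com/tag/{self.category}/'
        else:
            url = f'http://quotes.toscrape.com/page/{self.start_page}/'
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }

# 运行方式:scrapy crawl quotes_by_tag -a category=inspirational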
3.2 在Python脚本中运行
# 5. 在Python脚本中运行Spider
print("\n🐍 在Python脚本中运行:")
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
def run_spider_in_script():
"""
在Python脚本中运行Spider
"""
# 获取项目设置
settings = get_project_settings()
# 自定义设置
settings.set('FEEDS', {
'quotes.json': {
'format': 'json',
'encoding': 'utf8',
'store_empty': False,
'fields': None,
'indent': 4,
},
})
# 创建爬虫进程
process = CrawlerProcess(settings)
# 添加Spider
process.crawl('quotes')
# 启动爬虫
process.start()
# 示例:带参数运行
def run_spider_with_args():
"""
带参数运行Spider
"""
settings = get_project_settings()
process = CrawlerProcess(settings)
# 传递参数给Spider
process.crawl('quotes', category='love', max_pages=3)
process.start()
print("Python脚本运行方法定义完成!")
4. 数据提取技术
4.1 CSS选择器
# 6. CSS选择器详解
print("\n🎯 CSS选择器技术:")
def css_selector_examples():
"""
CSS选择器使用示例
"""
examples = {
"基本选择器": {
"元素选择器": "div",
"类选择器": ".quote",
"ID选择器": "#quote-1",
"属性选择器": "a[href]",
},
"组合选择器": {
"后代选择器": "div.quote span.text",
"子选择器": "div.quote > span.text",
"相邻兄弟": "small.author + a",
"通用兄弟": "small.author ~ a",
},
"伪类选择器": {
"第一个元素": "div.quote:first-child",
"最后一个元素": "div.quote:last-child",
"第n个元素": "div.quote:nth-child(2)",
"包含文本": "a:contains('Next')",
},
"属性提取": {
"文本内容": "span.text::text",
"属性值": "a::attr(href)",
"所有文本": "div.tags::text",
}
}
for category, selectors in examples.items():
print(f"\n{category}:")
for name, selector in selectors.items():
print(f" {name}: {selector}")
css_selector_examples()
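上面列出的CSS选择器可以脱离完整爬虫,直接用Scrapy自带的Selector在一段本地HTML上快速验证。下面是一个简要示例(HTML片段为演示用构造):
# 补充示例:用Selector在本地HTML片段上验证CSS选择器
from scrapy.selector import Selector

sample_html = '''
<div class="quote">
  <span class="text">"Simplicity is the ultimate sophistication."</span>
  <small class="author">Leonardo da Vinci</small>
  <div class="tags"><a class="tag" href="/tag/design/">design</a></div>
</div>
'''

sel = Selector(text=sample_html)
print(sel.css('span.text::text').get())              # 文本内容
print(sel.css('small.author::text').get())           # 作者
print(sel.css('div.tags a.tag::attr(href)').get())   # 属性值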
4.2 XPath选择器
# 7. XPath选择器详解
print("\n🗺️ XPath选择器技术:")
def xpath_selector_examples():
"""
XPath选择器使用示例
"""
examples = {
"基本语法": {
"选择根节点": "/",
"选择任意位置": "//",
"选择当前节点": ".",
"选择父节点": "..",
"选择属性": "@href",
},
"路径表达式": {
"绝对路径": "/html/body/div",
"相对路径": "//div[@class='quote']",
"选择文本": "//span[@class='text']/text()",
"选择属性": "//a/@href",
},
"谓语表达式": {
"按位置": "//div[1]",
"按属性": "//div[@class='quote']",
"按文本": "//a[text()='Next']",
"按包含": "//a[contains(@href, 'page')]",
},
"轴表达式": {
"子节点": "//div/child::span",
"父节点": "//span/parent::div",
"兄弟节点": "//span/following-sibling::a",
"祖先节点": "//span/ancestor::div",
}
}
for category, selectors in examples.items():
print(f"\n{category}:")
for name, selector in selectors.items():
print(f" {name}: {selector}")
xpath_selector_examples()
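XPath同样可以用Selector在本地HTML片段上验证,下面的简要示例与上文的谓语表达式对应(HTML片段为演示用构造):
# 补充示例:用Selector在本地HTML片段上验证XPath选择器
from scrapy.selector import Selector

sel = Selector(text='<div class="quote"><span class="text">Hello</span>'
                    '<a href="/page/2/">Next</a></div>')
print(sel.xpath('//span[@class="text"]/text()').get())         # Hello
print(sel.xpath('//a[text()="Next"]/@href').get())             # /page/2/
print(sel.xpath('//a[contains(@href, "page")]/text()').get())  # Next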
4.3 数据清洗和处理
# 8. 数据清洗和处理
print("\n🧹 数据清洗技术:")
import re
from w3lib.html import remove_tags
class DataProcessor:
"""
数据处理工具类
"""
@staticmethod
def clean_text(text):
"""
清洗文本数据
"""
if not text:
return ""
# 移除HTML标签
text = remove_tags(text)
# 移除多余空白
text = re.sub(r'\s+', ' ', text)
# 去除首尾空白
text = text.strip()
return text
@staticmethod
def extract_numbers(text):
"""
提取文本中的数字
"""
if not text:
return []
numbers = re.findall(r'\d+\.?\d*', text)
return [float(num) for num in numbers]
@staticmethod
def normalize_url(url, base_url):
"""
标准化URL
"""
from urllib.parse import urljoin, urlparse
# 处理相对URL
full_url = urljoin(base_url, url)
# 解析URL组件
parsed = urlparse(full_url)
# 移除fragment
normalized = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
if parsed.query:
normalized += f"?{parsed.query}"
return normalized
@staticmethod
def extract_emails(text):
"""
提取邮箱地址
"""
if not text:
return []
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
return re.findall(email_pattern, text)
# 使用示例
processor = DataProcessor()
# 测试数据清洗
sample_text = " <p>Hello World!</p> \n\t "
cleaned = processor.clean_text(sample_text)
print(f"原文: '{sample_text}'")
print(f"清洗后: '{cleaned}'")
# 测试数字提取
number_text = "价格:$29.99,折扣:15%"
numbers = processor.extract_numbers(number_text)
print(f"提取的数字: {numbers}")
5. 调试和测试
5.1 Scrapy Shell
# 9. 使用Scrapy Shell调试
print("\n🐚 Scrapy Shell调试:")
shell_commands = [
"# 启动Scrapy Shell",
"scrapy shell 'http://quotes.toscrape.com'",
"",
"# 在Shell中测试选择器",
"response.css('div.quote').getall()[:2]",
"response.css('span.text::text').getall()[:3]",
"response.xpath('//span[@class=\"text\"]/text()').getall()[:3]",
"",
"# 测试链接提取",
"response.css('li.next a::attr(href)').get()",
"response.urljoin(response.css('li.next a::attr(href)').get())",
"",
"# 查看响应信息",
"response.status",
"response.headers",
"len(response.body)",
"",
"# 测试新请求",
"fetch('http://quotes.toscrape.com/page/2/')",
"response.css('div.quote').getall()[:1]",
]
for cmd in shell_commands:
print(cmd)
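除了独立启动Shell,也可以在Spider的回调中使用Scrapy自带的inspect_response,在爬取过程中遇到异常页面时暂停并进入交互式Shell检查响应。下面是一个简要示意:
# 补充示例:在回调中嵌入Scrapy Shell调试(简要示意)
import scrapy
from scrapy.shell import inspect_response

class ShellDebugSpider(scrapy.Spider):
    name = 'shell_debug'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        if not response.css('div.quote'):
            # 页面结构不符合预期时,进入交互式Shell检查response,退出后继续爬取
            inspect_response(response, self)
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').get()}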
5.2 日志和调试
# 10. 日志和调试技术
print("\n📝 日志和调试:")
import logging
class DebuggingSpider(scrapy.Spider):
name = 'debugging_spider'
start_urls = ['http://quotes.toscrape.com/']
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# 设置日志级别
logging.getLogger('scrapy').setLevel(logging.INFO)
def parse(self, response):
# 记录基本信息
self.logger.info(f'解析页面: {response.url}')
self.logger.info(f'状态码: {response.status}')
self.logger.info(f'页面大小: {len(response.body)} bytes')
quotes = response.css('div.quote')
self.logger.info(f'找到 {len(quotes)} 个引用')
for i, quote in enumerate(quotes):
text = quote.css('span.text::text').get()
author = quote.css('small.author::text').get()
tags = quote.css('div.tags a.tag::text').getall()
# 调试信息(text可能为None,先做保护)
self.logger.debug(f'引用 {i+1}: {(text or "")[:50]}...')
# 数据验证
if not text or not author:
self.logger.warning(f'引用 {i+1} 数据不完整')
continue
yield {
'text': text,
'author': author,
'tags': tags,
'url': response.url,
}
# 分页处理
next_page = response.css('li.next a::attr(href)').get()
if next_page:
next_url = response.urljoin(next_page)
self.logger.info(f'跟踪下一页: {next_url}')
yield scrapy.Request(next_url, callback=self.parse)
else:
self.logger.info('已到达最后一页')
print("调试Spider定义完成!")
5.3 单元测试
# 11. Spider单元测试
print("\n🧪 Spider单元测试:")
import unittest
from scrapy.http import HtmlResponse, Request
class TestQuotesSpider(unittest.TestCase):
"""
Spider单元测试类
"""
def setUp(self):
"""
测试前准备
"""
self.spider = QuotesSpider()
def _get_response(self, file_name, url='http://quotes.toscrape.com/'):
"""
创建测试响应对象
"""
# 模拟HTML内容
html_content = '''
<div class="quote">
<span class="text">"The world as we have created it is a process of our thinking."</span>
<small class="author">Albert Einstein</small>
<div class="tags">
<a class="tag" href="/tag/change/">change</a>
<a class="tag" href="/tag/deep-thoughts/">deep-thoughts</a>
</div>
</div>
<li class="next">
<a href="/page/2/">Next <span aria-hidden="true">→</span></a>
</li>
'''
request = Request(url=url)
response = HtmlResponse(
url=url,
request=request,
body=html_content.encode('utf-8'),
encoding='utf-8'
)
return response
def test_parse_quotes(self):
"""
测试引用解析
"""
response = self._get_response('quotes.html')
results = list(self.spider.parse(response))
# 检查结果数量
quotes = [r for r in results if isinstance(r, dict)]
self.assertEqual(len(quotes), 1)
# 检查数据结构
quote = quotes[0]
self.assertIn('text', quote)
self.assertIn('author', quote)
self.assertIn('tags', quote)
# 检查数据内容
self.assertEqual(quote['author'], 'Albert Einstein')
self.assertIn('change', quote['tags'])
def test_next_page_extraction(self):
"""
测试下一页链接提取
"""
response = self._get_response('quotes.html')
results = list(self.spider.parse(response))
# 检查是否有下一页请求
requests = [r for r in results if isinstance(r, Request)]
self.assertEqual(len(requests), 1)
# 检查URL
next_request = requests[0]
self.assertEqual(next_request.url, 'http://quotes.toscrape.com/page/2/')
# 运行测试
if __name__ == '__main__':
unittest.main()
print("单元测试定义完成!")
6. 实际项目示例
6.1 新闻爬虫
# 12. 实际项目:新闻爬虫
print("\n📰 新闻爬虫示例:")
class NewsSpider(scrapy.Spider):
name = 'news'
allowed_domains = ['example-news.com']
start_urls = ['http://example-news.com/']
def parse(self, response):
"""
解析新闻列表页
"""
# 提取新闻链接
news_links = response.css('article.news-item a::attr(href)').getall()
for link in news_links:
news_url = response.urljoin(link)
yield scrapy.Request(
url=news_url,
callback=self.parse_news,
meta={'category': self.extract_category(response.url)}
)
# 分页处理
next_page = response.css('a.next-page::attr(href)').get()
if next_page:
yield scrapy.Request(
url=response.urljoin(next_page),
callback=self.parse
)
def parse_news(self, response):
"""
解析新闻详情页
"""
# 提取新闻数据
title = response.css('h1.news-title::text').get()
content = response.css('div.news-content::text').getall()
author = response.css('span.author::text').get()
publish_date = response.css('time.publish-date::attr(datetime)').get()
tags = response.css('div.tags a.tag::text').getall()
# 数据清洗
title = self.clean_text(title)
content = ' '.join([self.clean_text(p) for p in content])
yield {
'title': title,
'content': content,
'author': author,
'publish_date': publish_date,
'tags': tags,
'category': response.meta.get('category'),
'url': response.url,
'scraped_at': self.get_current_time(),
}
def extract_category(self, url):
"""
从URL提取分类
"""
import re
match = re.search(r'/category/([^/]+)/', url)
return match.group(1) if match else 'general'
def clean_text(self, text):
"""
清洗文本
"""
if not text:
return ""
return re.sub(r'\s+', ' ', text.strip())
def get_current_time(self):
"""
获取当前时间
"""
from datetime import datetime
return datetime.now().isoformat()
print("新闻爬虫示例完成!")
6.2 电商爬虫
# 13. 实际项目:电商爬虫
print("\n🛒 电商爬虫示例:")
class EcommerceSpider(scrapy.Spider):
name = 'ecommerce'
allowed_domains = ['example-shop.com']
def start_requests(self):
"""
生成初始请求
"""
categories = [
'electronics',
'clothing',
'books',
'home-garden'
]
for category in categories:
url = f'http://example-shop.com/category/{category}'
yield scrapy.Request(
url=url,
callback=self.parse_category,
meta={'category': category}
)
def parse_category(self, response):
"""
解析分类页面
"""
category = response.meta['category']
# 提取产品链接
product_links = response.css('div.product-item a::attr(href)').getall()
for link in product_links:
product_url = response.urljoin(link)
yield scrapy.Request(
url=product_url,
callback=self.parse_product,
meta={'category': category}
)
# 分页处理
next_page = response.css('a.pagination-next::attr(href)').get()
if next_page:
yield scrapy.Request(
url=response.urljoin(next_page),
callback=self.parse_category,
meta={'category': category}
)
def parse_product(self, response):
"""
解析产品详情页
"""
# 基本信息
name = response.css('h1.product-name::text').get()
price = response.css('span.price::text').get()
description = response.css('div.description::text').getall()
# 评分和评论
rating = response.css('div.rating span.score::text').get()
review_count = response.css('div.rating span.count::text').get()
# 库存信息
stock_status = response.css('span.stock-status::text').get()
# 图片链接
images = response.css('div.product-images img::attr(src)').getall()
# 规格信息
specifications = {}
spec_rows = response.css('table.specifications tr')
for row in spec_rows:
key = row.css('td.spec-key::text').get()
value = row.css('td.spec-value::text').get()
if key and value:
specifications[key.strip()] = value.strip()
# 数据处理
price_value = self.extract_price(price)
rating_value = self.extract_rating(rating)
review_count_value = self.extract_number(review_count)
yield {
'name': name,
'price': price_value,
'original_price_text': price,
'description': ' '.join(description),
'rating': rating_value,
'review_count': review_count_value,
'stock_status': stock_status,
'images': [response.urljoin(img) for img in images],
'specifications': specifications,
'category': response.meta.get('category'),
'url': response.url,
'scraped_at': self.get_current_time(),
}
def extract_price(self, price_text):
"""
提取价格数值
"""
if not price_text:
return None
# 移除货币符号和其他字符,提取数字
import re
match = re.search(r'[\d,]+\.?\d*', price_text.replace(',', ''))
return float(match.group()) if match else None
def extract_rating(self, rating_text):
"""
提取评分数值
"""
if not rating_text:
return None
import re
match = re.search(r'\d+\.?\d*', rating_text)
return float(match.group()) if match else None
def extract_number(self, text):
"""
提取数字
"""
if not text:
return None
import re
match = re.search(r'\d+', text.replace(',', ''))
return int(match.group()) if match else None
def get_current_time(self):
"""
获取当前时间
"""
from datetime import datetime
return datetime.now().isoformat()
print("电商爬虫示例完成!")
7. 本章小结
7.1 核心知识点
项目创建和结构
- Scrapy项目的创建方法
- 项目目录结构和文件作用
- 配置文件的使用
Spider开发
- Spider的基本结构和组件
- 数据提取和处理方法
- 请求和响应处理
数据提取技术
- CSS选择器的使用
- XPath选择器的应用
- 数据清洗和验证
调试和测试
- Scrapy Shell的使用
- 日志记录和调试技巧
- 单元测试方法
7.2 最佳实践
- 🏗️ 项目结构: 保持清晰的项目结构和命名规范
- 🎯 选择器优化: 选择稳定、高效的CSS/XPath选择器
- 🧹 数据清洗: 及时清洗和验证提取的数据
- 🐛 调试技巧: 充分利用Scrapy Shell进行调试
7.3 常见陷阱
- ❌ 选择器脆弱: 过度依赖页面结构,缺乏容错性
- ❌ 数据质量: 忽略数据验证和清洗
- ❌ 调试不足: 没有充分测试Spider的各种情况
- ❌ 硬编码: 在Spider中硬编码URL和参数
7.4 下一步学习
- 🔧 学习Items和Pipelines的使用
- 🌐 掌握处理JavaScript渲染的页面
- 🛡️ 了解反爬虫对策和应对方法
- 📊 学习数据存储和处理技术
8. 练习题
8.1 基础练习
创建简单Spider
- 创建一个爬取书籍信息的Spider
- 提取书名、作者、价格等信息
- 处理分页
数据提取练习
- 使用CSS选择器提取复杂结构数据
- 使用XPath处理特殊情况
- 实现数据清洗功能
调试技能
- 使用Scrapy Shell调试选择器
- 添加日志记录
- 编写单元测试
8.2 进阶练习
多页面爬虫
- 实现列表页和详情页的联合爬取
- 处理复杂的分页逻辑
- 在页面之间传递数据
数据验证
- 实现完整的数据验证机制
- 处理缺失和异常数据
- 添加数据质量检查
性能优化
- 优化选择器性能
- 减少不必要的请求
- 实现智能重试机制
8.3 挑战练习
复杂网站爬取
- 爬取需要登录的网站
- 处理AJAX加载的内容
- 应对简单的反爬虫措施
大规模爬虫
- 设计可扩展的爬虫架构
- 实现分布式爬取
- 添加监控和报警功能
实时爬虫
- 实现增量爬取
- 监控网站更新
- 设计实时数据处理流程
恭喜您完成第2章的学习! 🎉
您已经掌握了Scrapy项目的创建、Spider的编写和基本的数据提取技术。在下一章中,我们将深入学习Spider的高级功能和技巧。