Scrapy
Scrapy is a powerful Python crawling framework, well suited to large-scale data scraping.
Installation
pip install scrapy
Creating a Project
# Create a new project
scrapy startproject myproject
# Generate a spider
cd myproject
scrapy genspider myspider example.com
# Run the spider
scrapy crawl myspider
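For orientation, startproject generates a project skeleton along these lines (the spider file appears once genspider has run):

myproject/
    scrapy.cfg            # deploy configuration
    myproject/
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider and downloader middleware
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py
            myspider.py   # created by genspider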
A Basic Spider
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # Extract data from each quote block
        for quote in response.css('.quote'):
            yield {
                'text': quote.css('.text::text').get(),
                'author': quote.css('.author::text').get(),
                'tags': quote.css('.tag::text').getall(),
            }
        # Follow the link to the next page, if any
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
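Scrapy selectors also accept XPath (see the feature list at the end). As a point of comparison, here is the same parse logic rewritten with XPath expressions; the spider name is hypothetical and the markup assumed is that of quotes.toscrape.com:

import scrapy

class XPathSpider(scrapy.Spider):
    name = 'xpath_spider'  # hypothetical name for this sketch
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # Same extraction as above, expressed in XPath instead of CSS
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                'text': quote.xpath('.//span[@class="text"]/text()').get(),
                'author': quote.xpath('.//small[@class="author"]/text()').get(),
                'tags': quote.xpath('.//a[@class="tag"]/text()').getall(),
            }
        next_page = response.xpath('//li[@class="next"]/a/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)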
Defining Items
import scrapy

class QuoteItem(scrapy.Item):
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
Using Items
import scrapy
from myproject.items import QuoteItem

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('.quote'):
            item = QuoteItem()
            item['text'] = quote.css('.text::text').get()
            item['author'] = quote.css('.author::text').get()
            item['tags'] = quote.css('.tag::text').getall()
            yield item
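Scrapy also provides ItemLoader for populating items through reusable input/output processors. A minimal sketch, assuming the processors module from the itemloaders package that ships with recent Scrapy; the spider name is hypothetical:

import scrapy
from itemloaders.processors import Identity, MapCompose, TakeFirst
from scrapy.loader import ItemLoader
from myproject.items import QuoteItem

class QuoteLoader(ItemLoader):
    default_input_processor = MapCompose(str.strip)  # trim whitespace on input
    default_output_processor = TakeFirst()           # scalar fields yield one value
    tags_out = Identity()                            # keep 'tags' as a list

class LoaderSpider(scrapy.Spider):
    name = 'loader_spider'  # hypothetical name for this sketch
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('.quote'):
            loader = QuoteLoader(item=QuoteItem(), selector=quote)
            loader.add_css('text', '.text::text')
            loader.add_css('author', '.author::text')
            loader.add_css('tags', '.tag::text')
            yield loader.load_item()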
Item Pipelines
import json

class JsonPipeline:
    def open_spider(self, spider):
        self.file = open('output.json', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # Write one JSON object per line (JSON Lines)
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item
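Pipelines can also validate or drop items. A short sketch using Scrapy's DropItem exception; the required-field check is illustrative:

from scrapy.exceptions import DropItem

class ValidationPipeline:
    def process_item(self, item, spider):
        # Drop items that lack the required 'text' field (illustrative check)
        if not item.get('text'):
            raise DropItem(f'Missing text in {item!r}')
        return item

Register it in ITEM_PIPELINES with a lower number than JsonPipeline so invalid items are dropped before they reach the file.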
Configuring Pipelines
# settings.py
ITEM_PIPELINES = {
    # Priority 0-1000; lower numbers run first
    'myproject.pipelines.JsonPipeline': 300,
}
Middleware
class UserAgentMiddleware:
    def process_request(self, request, spider):
        request.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'

class ProxyMiddleware:
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://proxy.example.com:8080'
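A common variation is rotating the User-Agent per request. A minimal sketch; the UA strings are placeholders:

import random

class RandomUserAgentMiddleware:
    # Placeholder UA strings; substitute a maintained list in practice
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
        'Mozilla/5.0 (X11; Linux x86_64)',
    ]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
        return None  # None tells Scrapy to keep processing the request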
Configuring Middleware
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.UserAgentMiddleware': 400,
    'myproject.middlewares.ProxyMiddleware': 410,
}
Common Settings
# settings.py
# Whether to obey robots.txt (set True to respect it)
ROBOTSTXT_OBEY = False
# Delay between downloads (seconds)
DOWNLOAD_DELAY = 2
# Maximum concurrent requests
CONCURRENT_REQUESTS = 16
# Default User-Agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
# Disable cookies
COOKIES_ENABLED = False
# Retry attempts for failed requests
RETRY_TIMES = 3
# Download timeout (seconds)
DOWNLOAD_TIMEOUT = 30
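Instead of a fixed DOWNLOAD_DELAY, Scrapy's built-in AutoThrottle extension can adapt the delay to observed server latency. The values below are illustrative:

# settings.py
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1           # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 10            # ceiling when the server is slow
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0  # average parallel requests per server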
Exporting Data
# Export as JSON
scrapy crawl myspider -o output.json
# Export as CSV
scrapy crawl myspider -o output.csv
# Export as XML
scrapy crawl myspider -o output.xml
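On Scrapy 2.1+ the same exports can be declared in settings.py via the FEEDS setting, keeping output configuration in the project rather than on the command line. The file names here are illustrative:

# settings.py
FEEDS = {
    'output.json': {'format': 'json', 'encoding': 'utf8', 'overwrite': True},
    'output.csv': {'format': 'csv'},
}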
Running Multiple Spiders
from scrapy.crawler import CrawlerProcess
from myproject.spiders.myspider import MySpider

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
process.crawl(MySpider)
# Call process.crawl() again with other spider classes to run several at once
process.start()
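To reuse the project's settings.py instead of passing settings inline, Scrapy provides get_project_settings:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from myproject.spiders.myspider import MySpider

process = CrawlerProcess(get_project_settings())  # loads settings.py
process.crawl(MySpider)
process.start()  # blocks until crawling finishes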
Scrapy highlights:
- 🚀 High performance: asynchronous processing with high concurrency
- 🔧 Extensible: middleware, pipelines, and other extension points
- 📊 Data extraction: CSS selectors and XPath supported
- 💾 Data storage: exports to multiple formats
- 🌐 Distributed: distributed crawling supported (via extensions)
💡 Tip: Scrapy is well suited to large-scale scraping; pair it with Playwright to handle JavaScript-rendered pages.
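For example, with the third-party scrapy-playwright plugin (pip install scrapy-playwright), requests flagged in meta are rendered by a headless browser. A sketch based on that plugin's documented setup; verify against its README:

# settings.py (plugin-specific; check the scrapy-playwright README)
DOWNLOAD_HANDLERS = {
    'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
    'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
}
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'

# A spider requesting browser rendering (spider name is hypothetical)
import scrapy

class JsSpider(scrapy.Spider):
    name = 'js_spider'

    def start_requests(self):
        # The 'playwright' meta key asks the plugin to render the page
        yield scrapy.Request('https://example.com', meta={'playwright': True})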