BeautifulSoup

BeautifulSoup is a Python library for extracting data from HTML and XML documents, which makes it a natural fit for web scraping.

Installation

pip install beautifulsoup4
pip install lxml  # the lxml parser is recommended
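
lxml is faster, but it is optional: BeautifulSoup can also fall back to the html.parser backend that ships with the standard library, so a quick script works without installing any extra parser. A minimal sketch:

from bs4 import BeautifulSoup

# 'html.parser' ships with Python, so no extra install is needed;
# pass 'lxml' instead once the lxml package is available.
soup = BeautifulSoup('<p>hello</p>', 'html.parser')
print(soup.p.string)  # hello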

Basic Usage

from bs4 import BeautifulSoup

html = """

    
        测试页面
    
    
        

欢迎来到我的网站

这是一个段落

这是另一个段落

链接
  • 项目1
  • 项目2
  • 项目3
""" # 创建 BeautifulSoup 对象 soup = BeautifulSoup(html, 'lxml') # 格式化输出 print(soup.prettify()) # 获取标题 print(f"标题: {soup.title.string}") # 获取第一个段落 print(f"第一个段落: {soup.p.string}") # 通过 ID 查找 print(f"ID为intro的段落: {soup.find(id='intro').string}")

Finding Elements

from bs4 import BeautifulSoup

html = """

标题

段落1

段落2

链接1 链接2
""" soup = BeautifulSoup(html, 'lxml') # find() 查找第一个匹配的元素 h2 = soup.find('h2') print(f"h2: {h2.string}") # find_all() 查找所有匹配的元素 paragraphs = soup.find_all('p', class_='text') for p in paragraphs: print(f"段落: {p.string}") # 查找所有链接 links = soup.find_all('a') for link in links: print(f"链接文本: {link.string}, URL: {link['href']}") # CSS 选择器 container = soup.select_one('.container') texts = soup.select('.text') print(f"选择器结果: {len(texts)} 个段落")

Getting Attributes and Text

from bs4 import BeautifulSoup

html = """
点击这里
姓名: 张三 年龄: 25
""" soup = BeautifulSoup(html, 'lxml') # 获取属性 link = soup.find('a') print(f"href: {link['href']}") print(f"class: {link.get('class')}") print(f"id: {link.get('id')}") # 获取文本 print(f"链接文本: {link.string}") print(f"链接文本: {link.get_text()}") # 获取所有文本 div = soup.find('div', class_='info') print(f"所有文本: {div.get_text(strip=True)}")

Navigating the Document Tree

from bs4 import BeautifulSoup

html = """

段落1

段落2

嵌套内容
""" soup = BeautifulSoup(html, 'lxml') # 父元素 p = soup.find('p') print(f"父元素: {p.parent.name}") # 子元素 div = soup.find('div') for child in div.children: if child.name: print(f"子元素: {child.name}") # 后兄弟元素 first_p = soup.find('p') next_sibling = first_p.next_sibling print(f"下一个兄弟: {next_sibling}") # 所有兄弟元素 for sibling in first_p.next_siblings: if sibling.name: print(f"兄弟元素: {sibling.name}")

A Practical Scraping Example

import requests
from bs4 import BeautifulSoup

def scrape_quotes():
    url = 'https://quotes.toscrape.com/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    # Find every quote block
    quotes = soup.find_all('div', class_='quote')

    for quote in quotes:
        text = quote.find('span', class_='text').string
        author = quote.find('small', class_='author').string
        tags = [tag.string for tag in quote.find_all('a', class_='tag')]

        print(f"Quote: {text}")
        print(f"Author: {author}")
        print(f"Tags: {', '.join(tags)}")
        print("-" * 50)

scrape_quotes()
💡 Tip: BeautifulSoup works best on static pages; for pages rendered by JavaScript, pair it with Selenium or Playwright (a sketch follows below).
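
One common way to combine them, as the tip suggests, is to let a real browser render the page and then hand the final HTML to BeautifulSoup. A minimal sketch using Playwright's sync API (introduced later in this article):

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://quotes.toscrape.com/js/')  # JavaScript-rendered page
    html = page.content()  # final HTML after rendering
    browser.close()

soup = BeautifulSoup(html, 'lxml')
print(len(soup.select('.quote')))  # number of quotes found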

Selenium

Selenium is a browser automation tool, originally built for testing, that drives a real browser and is well suited to scraping dynamic pages.

Installation

pip install selenium

Basic Usage

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# Create the browser driver (the matching browser driver must be available)
driver = webdriver.Chrome()  # or webdriver.Firefox()

try:
    # Open the page
    driver.get('https://www.baidu.com')

    # Locate the search box
    search_box = driver.find_element(By.ID, 'kw')

    # Type the query
    search_box.send_keys('Python 爬虫')

    # Submit the search
    search_box.send_keys(Keys.RETURN)

    # Wait for the page to load
    time.sleep(2)

    # Collect the search results
    results = driver.find_elements(By.CSS_SELECTOR, '.result')
    for result in results[:5]:
        print(result.text)

finally:
    # Close the browser
    driver.quit()

Locating Elements

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://example.com')

# By ID
element = driver.find_element(By.ID, 'element_id')

# By name
element = driver.find_element(By.NAME, 'element_name')

# By class name
element = driver.find_element(By.CLASS_NAME, 'element_class')

# By tag name
element = driver.find_element(By.TAG_NAME, 'div')

# By CSS selector
element = driver.find_element(By.CSS_SELECTOR, '.class > div')

# By XPath
element = driver.find_element(By.XPATH, '//div[@class="example"]')

# By link text
element = driver.find_element(By.LINK_TEXT, '点击这里')

# By partial link text
element = driver.find_element(By.PARTIAL_LINK_TEXT, '点击')

driver.quit()

Interacting with Elements

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

driver = webdriver.Chrome()
driver.get('https://example.com')

# Click a button
button = driver.find_element(By.ID, 'submit')
button.click()

# Type text
input_field = driver.find_element(By.ID, 'username')
input_field.send_keys('张三')

# Clear the input
input_field.clear()

# Read an element's text
text = button.text

# Read an attribute
link = driver.find_element(By.TAG_NAME, 'a')
href = link.get_attribute('href')

# Select from a dropdown
select = Select(driver.find_element(By.ID, 'country'))
select.select_by_visible_text('中国')
select.select_by_value('CN')
select.select_by_index(0)

# Scroll the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

driver.quit()

Wait Mechanisms

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.get('https://example.com')

# Explicit wait
try:
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'dynamic-element'))
    )
    print("Element loaded")
except TimeoutException:
    print("Timed out waiting for the element")

# Wait until an element is clickable
element = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, 'button'))
)

# Wait for specific text to appear
WebDriverWait(driver, 10).until(
    EC.text_to_be_present_in_element((By.ID, 'status'), '完成')
)

driver.quit()
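
Selenium also has an implicit wait: a single global timeout that makes every find_element call poll until the element appears. Explicit waits are usually the better tool, and mixing the two can lead to unpredictable delays; a minimal sketch:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.implicitly_wait(10)  # every lookup retries for up to 10 seconds

driver.get('https://example.com')
element = driver.find_element(By.ID, 'dynamic-element')  # waits automatically
print(element.text)

driver.quit()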

Handling Alerts and iframes

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://example.com')

# Handle an alert box
driver.find_element(By.ID, 'show-alert').click()
alert = driver.switch_to.alert
print(f"Alert text: {alert.text}")
alert.accept()  # click OK
# alert.dismiss()  # click Cancel

# Handle a confirm box
driver.find_element(By.ID, 'show-confirm').click()
confirm = driver.switch_to.alert
confirm.dismiss()

# Handle a prompt box
driver.find_element(By.ID, 'show-prompt').click()
prompt = driver.switch_to.alert
prompt.send_keys('输入内容')
prompt.accept()

# Switch into an iframe
driver.switch_to.frame('iframe_id')
# ... interact inside the iframe ...
driver.switch_to.default_content()  # switch back to the main page

driver.quit()

Headless Mode

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Configure headless mode
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

driver = webdriver.Chrome(options=chrome_options)
driver.get('https://example.com')
print(driver.title)
driver.quit()
💡 Tip: Selenium handles dynamic pages that need JavaScript rendering, but it is slower and uses more resources.

Playwright

Playwright is a modern browser automation tool from Microsoft that is faster, more stable, and more capable than Selenium.

Installation

pip install playwright
playwright install  # download the browser binaries

Basic Usage

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    # Launch the browser
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()

    # Visit a page
    page.goto('https://example.com')

    # Get the title
    print(f"Title: {page.title()}")

    # Take a screenshot
    page.screenshot(path='example.png')

    # Close the browser
    browser.close()

Locating and Interacting with Elements

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto('https://example.com')

    # Click an element
    page.click('button#submit')

    # Type text
    page.fill('input#username', '张三')

    # Read text content
    text = page.text_content('div.content')

    # Read an attribute
    href = page.get_attribute('a.link', 'href')

    # Wait for an element
    page.wait_for_selector('.dynamic-content')

    # Wait for the network to settle
    page.wait_for_load_state('networkidle')

    # Run JavaScript
    result = page.evaluate('() => document.title')

    browser.close()

Form Operations

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto('https://example.com/login')

    # Fill in the form
    page.fill('input[name="username"]', 'admin')
    page.fill('input[name="password"]', 'password123')

    # Pick a dropdown option
    page.select_option('select#country', 'China')

    # Tick a checkbox
    page.check('input[type="checkbox"]')

    # Click a radio button
    page.click('input[type="radio"][value="male"]')

    # Submit the form
    page.click('button[type="submit"]')

    # Wait for the redirect
    page.wait_for_url('**/dashboard')

    browser.close()

Network Interception

from playwright.sync_api import sync_playwright

def handle_route(route):
    # Modify the outgoing request
    headers = route.request.headers
    headers['X-Custom-Header'] = 'test'
    route.continue_(headers=headers)

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()

    # Intercept requests
    page.route('**/*.{png,jpg,jpeg}', lambda route: route.abort())
    page.route('**/api/**', handle_route)

    # Listen for responses
    def handle_response(response):
        if 'api' in response.url:
            print(f"API response: {response.status}")

    page.on('response', handle_response)

    page.goto('https://example.com')
    browser.close()

Multiple Pages and Contexts

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)

    # Create several contexts (similar to separate incognito sessions)
    context1 = browser.new_context()
    context2 = browser.new_context()

    # Create pages in different contexts
    page1 = context1.new_page()
    page2 = context2.new_page()

    page1.goto('https://example.com')
    page2.goto('https://test.com')

    # Handle a new tab opened from page1
    with page1.expect_popup() as popup_info:
        page1.click('a[target="_blank"]')
    new_page = popup_info.value

    browser.close()

Async Mode

import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        await page.goto('https://example.com')
        print(f"标题: {await page.title()}")

        await browser.close()

asyncio.run(main())
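
The async API pays off when several pages are driven concurrently; a sketch that collects a few titles in parallel with asyncio.gather (the URLs are placeholders):

import asyncio
from playwright.async_api import async_playwright

async def fetch_title(browser, url):
    page = await browser.new_page()
    await page.goto(url)
    title = await page.title()
    await page.close()
    return title

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        urls = ['https://example.com', 'https://example.org']
        titles = await asyncio.gather(*(fetch_title(browser, u) for u in urls))
        print(titles)
        await browser.close()

asyncio.run(main())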

Scraping Example

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()

    # Visit the page
    page.goto('https://quotes.toscrape.com/js/')

    # Wait for the content to load
    page.wait_for_selector('.quote')

    # Collect every quote
    quotes = page.locator('.quote')
    count = quotes.count()

    for i in range(count):
        text = quotes.nth(i).locator('.text').text_content()
        author = quotes.nth(i).locator('.author').text_content()
        print(f"{author}: {text}")

    browser.close()

Playwright's advantages:

  • ⚡ Faster: noticeably quicker than Selenium
  • 🎯 More stable: auto-waiting is built in
  • 🌐 Multi-browser: supports Chromium, Firefox, and WebKit
  • 📱 Mobile: supports mobile device emulation (sketched after the tip below)
  • 🔧 Powerful API: network interception, form handling, and more
💡 Tip: Playwright is the best choice for modern web automation; reach for it first.
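
The mobile emulation mentioned above comes from Playwright's built-in device registry; a minimal sketch (the device name must be a key that exists in p.devices, 'iPhone 13' is assumed here):

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    iphone = p.devices['iPhone 13']  # viewport, user agent, touch, etc.
    browser = p.chromium.launch(headless=True)
    context = browser.new_context(**iphone)
    page = context.new_page()
    page.goto('https://example.com')
    print(page.viewport_size)  # mobile-sized viewport
    browser.close()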

Scrapy

Scrapy is a powerful Python crawling framework designed for large-scale data extraction.

Installation

pip install scrapy

Creating a Project

# Create a new project
scrapy startproject myproject

# Generate a spider
cd myproject
scrapy genspider myspider example.com

# Run the spider
scrapy crawl myspider

A Basic Spider

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # Extract the data
        for quote in response.css('.quote'):
            yield {
                'text': quote.css('.text::text').get(),
                'author': quote.css('.author::text').get(),
                'tags': quote.css('.tag::text').getall(),
            }

        # Follow the next-page link
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

Defining Items

import scrapy

class QuoteItem(scrapy.Item):
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()

Using Items

import scrapy
from myproject.items import QuoteItem

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('.quote'):
            item = QuoteItem()
            item['text'] = quote.css('.text::text').get()
            item['author'] = quote.css('.author::text').get()
            item['tags'] = quote.css('.tag::text').getall()
            yield item

Item Pipelines

import json

class JsonPipeline:
    def open_spider(self, spider):
        self.file = open('output.json', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

Configuring Pipelines

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.JsonPipeline': 300,
}

Middleware

class UserAgentMiddleware:
    def process_request(self, request, spider):
        request.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'

class ProxyMiddleware:
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://proxy.example.com:8080'

Configuring Middleware

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.UserAgentMiddleware': 400,
    'myproject.middlewares.ProxyMiddleware': 410,
}

Common Settings

# settings.py

# Whether to obey robots.txt
ROBOTSTXT_OBEY = False

# Delay between requests (seconds)
DOWNLOAD_DELAY = 2

# Number of concurrent requests
CONCURRENT_REQUESTS = 16

# User-Agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'

# Cookies
COOKIES_ENABLED = False

# Retry count
RETRY_TIMES = 3

# Download timeout (seconds)
DOWNLOAD_TIMEOUT = 30

Exporting Data

# Export to JSON
scrapy crawl myspider -o output.json

# Export to CSV
scrapy crawl myspider -o output.csv

# Export to XML
scrapy crawl myspider -o output.xml
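
Exports can also be configured in settings.py rather than on the command line; a minimal sketch using the FEEDS setting available in recent Scrapy releases:

# settings.py
FEEDS = {
    'output.json': {'format': 'json', 'encoding': 'utf8'},
    'output.csv': {'format': 'csv'},
}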

Running Spiders from a Script

from scrapy.crawler import CrawlerProcess
from myproject.spiders.myspider import MySpider

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})

process.crawl(MySpider)
process.start()

Scrapy highlights:

  • 🚀 High performance: asynchronous processing with high concurrency
  • 🔧 Extensible: middleware, pipelines, and other extension points
  • 📊 Data extraction: CSS selectors and XPath (an XPath sketch follows the tip below)
  • 💾 Data storage: exports to multiple formats
  • 🌐 Distributed: supports distributed crawling
💡 Tip: Scrapy shines for large-scale crawls, and combined with Playwright it can also handle dynamic pages.
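
The Scrapy examples above use CSS selectors only; the XPath support mentioned in the list works through response.xpath() with the same .get()/.getall() API. A sketch against the same quotes site:

import scrapy

class XPathQuotesSpider(scrapy.Spider):
    name = 'xpath_quotes'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # Same extraction as before, expressed in XPath
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                'text': quote.xpath('.//span[@class="text"]/text()').get(),
                'author': quote.xpath('.//small[@class="author"]/text()').get(),
            }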

Regular Expressions

Regular expressions are a powerful text-matching tool; Python supports them through the re module.

Basic Matching

import re

# The string to match against
text = "Hello, Python 3.10!"

# Find the first match
match = re.search(r'Python', text)
if match:
    print(f"Found: {match.group()}")  # Python

# Find every match
matches = re.findall(r'\d+', text)
print(f"All numbers: {matches}")  # ['3', '10']

# Substitute
new_text = re.sub(r'Python', 'Java', text)
print(f"After substitution: {new_text}")  # Hello, Java 3.10!

# Split
parts = re.split(r'[,\s]+', text)
print(f"Split result: {parts}")  # ['Hello', 'Python', '3.10!']

Common Patterns

import re

# Validate an email address (fullmatch requires the whole string to match)
email = "user@example.com"
pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
if re.fullmatch(pattern, email):
    print("Valid email address")

# Validate a Chinese mobile number
phone = "13812345678"
pattern = r'1[3-9]\d{9}'
if re.fullmatch(pattern, phone):
    print("Valid phone number")

# Validate a URL
url = "https://www.example.com/path"
pattern = r'https?://[\w\.-]+/\S*'
if re.fullmatch(pattern, url):
    print("Valid URL")

# Validate a date
date = "2026-01-25"
pattern = r'\d{4}-\d{2}-\d{2}'
if re.fullmatch(pattern, date):
    print("Valid date format")

Groups and Capturing

import re

# Group matching
text = "张三: 25岁, 李四: 30岁"
pattern = r'(\w+): (\d+)岁'
matches = re.findall(pattern, text)
print(matches)  # [('张三', '25'), ('李四', '30')]

# Named groups
text = "2026-01-25"
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
match = re.match(pattern, text)
if match:
    print(f"Year: {match.group('year')}")
    print(f"Month: {match.group('month')}")
    print(f"Day: {match.group('day')}")

# Non-capturing group
text = "apple, banana, orange"
pattern = r'(?:apple|banana), (\w+)'
match = re.search(pattern, text)
if match:
    print(f"Captured: {match.group(1)}")  # banana

Precompiled Regular Expressions

import re

# Precompile the pattern (better performance when reused)
pattern = re.compile(r'\d+')

# Use the compiled pattern
text = "Python 3.10 is awesome"
matches = pattern.findall(text)
print(matches)  # ['3', '10']

# Reuse it many times
for text in ["Python 3.8", "Python 3.9", "Python 3.10"]:
    version = pattern.findall(text)
    print(f"{text}: {version}")

Common Regex Metacharacters

import re

# . matches any character (except a newline)
re.search(r'P.y', 'Pay')  # matches

# ^ matches the start of the string
re.search(r'^Hello', 'Hello World')  # matches

# $ matches the end of the string
re.search(r'World$', 'Hello World')  # matches

# * matches 0 or more repetitions
re.search(r'Py*', 'Pyyyyython')  # matches

# + matches 1 or more repetitions
re.search(r'Py+', 'Pyyyyython')  # matches

# ? matches 0 or 1 repetition
re.search(r'Py?', 'Pthon')  # matches

# {n} matches exactly n repetitions
re.search(r'\d{3}', '12345')  # matches '123'

# {n,m} matches n to m repetitions
re.search(r'\d{1,3}', '12345')  # matches '123'

# [] character class
re.search(r'[Pp]ython', 'Python')  # matches

# | alternation
re.search(r'Python|Java', 'Python')  # matches

# \d digit, \w word character, \s whitespace
re.search(r'\d+', 'abc123')  # matches '123'
💡 Tip: Prefix regular expressions with r (a raw string) to avoid problems with escape characters.
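
The difference is easy to demonstrate: without the r prefix, Python itself interprets the backslash escape before the regex engine ever sees it.

import re

print(len('\n'))    # 1 - a single newline character
print(len(r'\n'))   # 2 - a backslash followed by the letter n

# Both patterns happen to match a literal newline here, but r'...' is unambiguous
print(re.search(r'\n', 'a\nb') is not None)  # True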