Selenium →

BeautifulSoup

BeautifulSoup 是一个 Python 库,用于从 HTML 或 XML 文件中提取数据,非常适合网页爬虫。

安装

pip install beautifulsoup4
pip install lxml  # 推荐使用 lxml 解析器

基本使用

from bs4 import BeautifulSoup

# Sample HTML document.
# NOTE(review): the original markup was stripped by the page extraction
# (only visible text survived, plus unrelated CSS that leaked in); this
# markup is reconstructed so the lookups below actually work.
html = """
<html>
  <head>
    <title>测试页面</title>
  </head>
  <body>
    <h1>欢迎来到我的网站</h1>
    <p id="intro">这是一个段落</p>
    <p>这是另一个段落</p>
    <a href="https://example.com">链接</a>
    <ul>
      <li>项目1</li>
      <li>项目2</li>
      <li>项目3</li>
    </ul>
  </body>
</html>
"""

# Create the BeautifulSoup object ('lxml' is the recommended parser)
soup = BeautifulSoup(html, 'lxml')

# Pretty-print the parsed tree
print(soup.prettify())

# Get the document title
print(f"标题: {soup.title.string}")

# Get the first paragraph
print(f"第一个段落: {soup.p.string}")

# Look an element up by its id attribute
print(f"ID为intro的段落: {soup.find(id='intro').string}")

查找元素

from bs4 import BeautifulSoup

# Sample HTML (reconstructed — the original tags were lost in extraction,
# leaving only the visible text; structure inferred from the lookups below).
html = """
<div class="container">
  <h2>标题</h2>
  <p class="text">段落1</p>
  <p class="text">段落2</p>
  <a href="https://example.com/1">链接1</a>
  <a href="https://example.com/2">链接2</a>
</div>
"""

soup = BeautifulSoup(html, 'lxml')

# find() returns the first matching element (or None)
h2 = soup.find('h2')
print(f"h2: {h2.string}")

# find_all() returns every matching element; class_ avoids the
# 'class' keyword clash
paragraphs = soup.find_all('p', class_='text')
for p in paragraphs:
    print(f"段落: {p.string}")

# Collect every link; attributes are read with subscript access
links = soup.find_all('a')
for link in links:
    print(f"链接文本: {link.string}, URL: {link['href']}")

# CSS selectors: select_one() -> first match, select() -> all matches
container = soup.select_one('.container')
texts = soup.select('.text')
print(f"选择器结果: {len(texts)} 个段落")

获取属性和文本

from bs4 import BeautifulSoup

# Sample HTML (reconstructed — the original tags were lost in extraction;
# the <a> carries href/class/id since all three are read below).
html = """
<a href="https://example.com" class="btn primary" id="link1">点击这里</a>
<div class="info">
  <span>姓名: 张三</span>
  <span>年龄: 25</span>
</div>
"""

soup = BeautifulSoup(html, 'lxml')

# Attribute access: tag['attr'] raises KeyError when missing,
# tag.get('attr') returns None instead (class gives a list of tokens)
link = soup.find('a')
print(f"href: {link['href']}")
print(f"class: {link.get('class')}")
print(f"id: {link.get('id')}")

# .string works when the tag has a single text child;
# .get_text() concatenates all nested text
print(f"链接文本: {link.string}")
print(f"链接文本: {link.get_text()}")

# strip=True trims whitespace around each text fragment
div = soup.find('div', class_='info')
print(f"所有文本: {div.get_text(strip=True)}")

遍历文档树

from bs4 import BeautifulSoup

# Sample HTML (reconstructed — original tags lost in extraction; a nested
# element is included so the traversal examples have structure to walk).
html = """
<div>
  <p>段落1</p>
  <p>段落2</p>
  <span><b>嵌套内容</b></span>
</div>
"""

soup = BeautifulSoup(html, 'lxml')

# Parent element
p = soup.find('p')
print(f"父元素: {p.parent.name}")

# Direct children — text nodes have name == None, so skip them
div = soup.find('div')
for child in div.children:
    if child.name:
        print(f"子元素: {child.name}")

# Next sibling — may be a whitespace text node, not a tag
first_p = soup.find('p')
next_sibling = first_p.next_sibling
print(f"下一个兄弟: {next_sibling}")

# All following siblings (again filtering out bare text nodes)
for sibling in first_p.next_siblings:
    if sibling.name:
        print(f"兄弟元素: {sibling.name}")

实际爬虫示例

import requests
from bs4 import BeautifulSoup

def scrape_quotes():
    """Fetch quotes.toscrape.com and print each quote, author and tags.

    Network side effects only; returns None. Raises
    requests.RequestException on connection/timeout/HTTP errors.
    """
    url = 'https://quotes.toscrape.com/'
    # timeout stops the request from hanging forever;
    # raise_for_status surfaces 4xx/5xx instead of parsing an error page
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Each quote sits in a <div class="quote"> container
    quotes = soup.find_all('div', class_='quote')

    for quote in quotes:
        # get_text() is safer than .string here: .string is None
        # whenever a tag contains more than one child node
        text = quote.find('span', class_='text').get_text()
        author = quote.find('small', class_='author').get_text()
        tags = [tag.get_text() for tag in quote.find_all('a', class_='tag')]

        print(f"名言: {text}")
        print(f"作者: {author}")
        print(f"标签: {', '.join(tags)}")
        print("-" * 50)

scrape_quotes()
💡 提示:BeautifulSoup 适合解析静态网页,对于动态网页需要配合 Selenium 或 Playwright 使用。
Selenium →