这是一个段落
这是另一个段落
链接
- 项目1
- 项目2
- 项目3
BeautifulSoup 是一个 Python 库,用于从 HTML 或 XML 文件中提取数据,非常适合网页爬虫。
pip install beautifulsoup4
pip install lxml # 推荐使用 lxml 解析器
from bs4 import BeautifulSoup

# Sample page reconstructed (the original markup was stripped during extraction):
# a <title>, an <h1>, and two <p> tags — the first carries id="intro" — so every
# lookup below finds a real element instead of crashing on None.
html = """
<html>
<head><title>测试页面</title></head>
<body>
<h1>欢迎来到我的网站</h1>
<p id="intro">这是一个段落</p>
<p>这是另一个段落</p>
</body>
</html>
"""

# Create the BeautifulSoup object ('lxml' is a third-party parser;
# install it with `pip install lxml`, or use the stdlib 'html.parser')
soup = BeautifulSoup(html, 'lxml')

# Pretty-print the parsed tree with one tag per line
print(soup.prettify())

# <title> element — soup.title is the first (only) <title> tag
print(f"标题: {soup.title.string}")

# First <p> element in document order
print(f"第一个段落: {soup.p.string}")

# Lookup by id attribute — ids are unique, so find() is the natural call
print(f"ID为intro的段落: {soup.find(id='intro').string}")
from bs4 import BeautifulSoup

# Sample fragment reconstructed (the extracted source had an empty string here):
# it contains a .container div, an <h2>, two <p class="text">, and an <a href>,
# so each query below matches at least one element.
html = """
<div class="container">
<h2>标题</h2>
<p class="text">这是一个段落</p>
<p class="text">这是另一个段落</p>
<a href="https://example.com">链接</a>
</div>
"""
soup = BeautifulSoup(html, 'lxml')

# find() returns the FIRST matching element (or None when nothing matches)
h2 = soup.find('h2')
print(f"h2: {h2.string}")

# find_all() returns a list of EVERY matching element;
# class_ has a trailing underscore because `class` is a Python keyword
paragraphs = soup.find_all('p', class_='text')
for p in paragraphs:
    print(f"段落: {p.string}")

# All hyperlinks; link['href'] reads the href attribute
links = soup.find_all('a')
for link in links:
    print(f"链接文本: {link.string}, URL: {link['href']}")

# CSS selectors: select_one() -> first match, select() -> list of all matches
container = soup.select_one('.container')
texts = soup.select('.text')
print(f"选择器结果: {len(texts)} 个段落")
from bs4 import BeautifulSoup

# Sample fragment reconstructed (original tags were stripped during extraction):
# a link carrying href/class/id attributes, plus a div.info holding two lines
# of text — matching every access performed below.
html = """
<a href="https://example.com" class="btn link" id="main-link">点击这里</a>
<div class="info">
姓名: 张三
年龄: 25
</div>
"""
soup = BeautifulSoup(html, 'lxml')

# Attribute access: tag['attr'] raises KeyError when the attribute is missing;
# tag.get('attr') returns None instead — same contract as a dict
link = soup.find('a')
print(f"href: {link['href']}")
print(f"class: {link.get('class')}")  # class is multi-valued: always a list
print(f"id: {link.get('id')}")

# .string works when the tag has exactly one text child; get_text() always works
print(f"链接文本: {link.string}")
print(f"链接文本: {link.get_text()}")

# get_text(strip=True) joins all descendant text with whitespace trimmed
div = soup.find('div', class_='info')
print(f"所有文本: {div.get_text(strip=True)}")
from bs4 import BeautifulSoup

# Sample fragment reconstructed (original tags were stripped during extraction):
# a <div> containing two sibling <p> tags and a <span>, giving the
# parent/children/sibling navigation below a tree to walk.
html = """
<div>
<p>段落1</p>
<p>段落2</p>
<span>嵌套内容</span>
</div>
"""
soup = BeautifulSoup(html, 'lxml')

# Parent element
p = soup.find('p')
print(f"父元素: {p.parent.name}")

# Children — .children also yields NavigableString text nodes (the newlines
# between tags); those have .name == None, hence the filter
div = soup.find('div')
for child in div.children:
    if child.name:
        print(f"子元素: {child.name}")

# Next sibling — often a whitespace text node rather than the next tag
first_p = soup.find('p')
next_sibling = first_p.next_sibling
print(f"下一个兄弟: {next_sibling}")

# All following siblings that are real tags
for sibling in first_p.next_siblings:
    if sibling.name:
        print(f"兄弟元素: {sibling.name}")
import requests
from bs4 import BeautifulSoup


def scrape_quotes():
    """Fetch quotes.toscrape.com and print each quote's text, author, and tags."""
    url = 'https://quotes.toscrape.com/'
    # timeout= keeps the script from hanging forever on a stalled connection;
    # raise_for_status() fails loudly on 4xx/5xx instead of parsing an error page
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Each quote on the page lives inside <div class="quote">
    quotes = soup.find_all('div', class_='quote')
    for quote in quotes:
        text = quote.find('span', class_='text').string
        author = quote.find('small', class_='author').string
        tags = [tag.string for tag in quote.find_all('a', class_='tag')]
        print(f"名言: {text}")
        print(f"作者: {author}")
        print(f"标签: {', '.join(tags)}")
        print("-" * 50)


if __name__ == "__main__":
    scrape_quotes()