这是一个段落
这是另一个段落
链接
- 项目1
- 项目2
- 项目3
BeautifulSoup 是一个 Python 库,用于从 HTML 或 XML 文件中提取数据,非常适合网页爬虫。
pip install beautifulsoup4
pip install lxml # 推荐使用 lxml 解析器
from bs4 import BeautifulSoup

# Sample HTML document.
# NOTE(review): the original tags were stripped during extraction; this markup
# is reconstructed so that every query below (title, first <p>, id="intro")
# actually resolves instead of raising AttributeError on None.
html = """
<html>
<head><title>测试页面</title></head>
<body>
<p id="intro">欢迎来到我的网站</p>
</body>
</html>
"""

# Create the BeautifulSoup object using the lxml parser
soup = BeautifulSoup(html, 'lxml')

# Pretty-print the parsed tree
print(soup.prettify())

# Get the page title
print(f"标题: {soup.title.string}")

# Get the first paragraph
print(f"第一个段落: {soup.p.string}")

# Look an element up by its id attribute
print(f"ID为intro的段落: {soup.find(id='intro').string}")
from bs4 import BeautifulSoup

# Sample HTML document.
# NOTE(review): the original markup was lost during extraction; reconstructed
# so that find('h2'), find_all('p', class_='text'), find_all('a'), and the
# CSS selectors '.container' / '.text' below all return results.
html = """
<div class="container">
<h2>章节标题</h2>
<p class="text">段落一</p>
<p class="text">段落二</p>
<a href="https://example.com/page1">链接1</a>
<a href="https://example.com/page2">链接2</a>
</div>
"""

soup = BeautifulSoup(html, 'lxml')

# find() returns the first matching element (or None)
h2 = soup.find('h2')
print(f"h2: {h2.string}")

# find_all() returns every matching element
paragraphs = soup.find_all('p', class_='text')
for p in paragraphs:
    print(f"段落: {p.string}")

# Find all hyperlinks and read their href attribute
links = soup.find_all('a')
for link in links:
    print(f"链接文本: {link.string}, URL: {link['href']}")

# CSS selectors: select_one() -> first match, select() -> list of matches
container = soup.select_one('.container')
texts = soup.select('.text')
print(f"选择器结果: {len(texts)} 个段落")
from bs4 import BeautifulSoup

# Sample HTML document.
# NOTE(review): tags were stripped during extraction; reconstructed so the
# attribute lookups (href, class, id) and the div.info text extraction work.
html = """
<a href="https://example.com" class="link external" id="main-link">点击这里</a>
<div class="info">
<p>姓名: 张三</p>
<p>年龄: 25</p>
</div>
"""

soup = BeautifulSoup(html, 'lxml')

# Attribute access: tag['attr'] raises KeyError if missing,
# tag.get('attr') returns None instead.
link = soup.find('a')
print(f"href: {link['href']}")
print(f"class: {link.get('class')}")  # 'class' is multi-valued -> list
print(f"id: {link.get('id')}")

# Text access: .string for a single text child, .get_text() for any tag
print(f"链接文本: {link.string}")
print(f"链接文本: {link.get_text()}")

# get_text() on a container concatenates all descendant text;
# strip=True trims surrounding whitespace from each piece
div = soup.find('div', class_='info')
print(f"所有文本: {div.get_text(strip=True)}")
from bs4 import BeautifulSoup

# Sample HTML document.
# NOTE(review): tags were stripped during extraction; reconstructed with a
# <div> parent, two <p> siblings, and a nested element so parent/children/
# sibling navigation below has something to traverse.
html = """
<div>
<p>段落1</p>
<p>段落2</p>
<span><b>嵌套内容</b></span>
</div>
"""

soup = BeautifulSoup(html, 'lxml')

# Parent element
p = soup.find('p')
print(f"父元素: {p.parent.name}")

# Child elements; .children also yields whitespace text nodes,
# so filter on .name to keep only tags
div = soup.find('div')
for child in div.children:
    if child.name:
        print(f"子元素: {child.name}")

# Next sibling (may be a whitespace text node, not a tag)
first_p = soup.find('p')
next_sibling = first_p.next_sibling
print(f"下一个兄弟: {next_sibling}")

# All following siblings, tags only
for sibling in first_p.next_siblings:
    if sibling.name:
        print(f"兄弟元素: {sibling.name}")
import requests
from bs4 import BeautifulSoup


def scrape_quotes():
    """Fetch quotes.toscrape.com and print each quote's text, author and tags.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.RequestException: on network failure or timeout.
    """
    url = 'https://quotes.toscrape.com/'
    # Timeout prevents hanging forever; raise_for_status surfaces HTTP errors
    # instead of silently parsing an error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Each quote lives in a <div class="quote"> container
    quotes = soup.find_all('div', class_='quote')
    for quote in quotes:
        text = quote.find('span', class_='text').string
        author = quote.find('small', class_='author').string
        tags = [tag.string for tag in quote.find_all('a', class_='tag')]
        print(f"名言: {text}")
        print(f"作者: {author}")
        print(f"标签: {', '.join(tags)}")
        print("-" * 50)


if __name__ == "__main__":
    scrape_quotes()