← BeautifulSoup | Playwright →

Selenium

Selenium 是一个自动化测试工具,可以模拟浏览器操作,适合爬取动态网页。

安装

pip install selenium

基本使用

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Start a browser session. Selenium 4.6+ resolves a matching driver
# automatically through Selenium Manager; older releases need the browser
# driver downloaded and placed on PATH beforehand.
driver = webdriver.Chrome()  # or webdriver.Firefox()

try:
    # Open the page
    driver.get('https://www.baidu.com')

    # Locate the search box
    search_box = driver.find_element(By.ID, 'kw')

    # Type the query
    search_box.send_keys('Python 爬虫')

    # Submit the search
    search_box.send_keys(Keys.RETURN)

    # Wait (up to 10 s) until at least one result is in the DOM. A fixed
    # sleep is either too short on a slow network or wastes time on a fast
    # one; an explicit wait returns as soon as the results appear.
    try:
        results = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.result'))
        )
    except TimeoutException:
        # No result showed up in time: stay best-effort and print nothing.
        results = []

    # Print the text of the first five results
    for result in results[:5]:
        print(result.text)

finally:
    # Always close the browser, even if a step above failed
    driver.quit()

元素定位

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    driver.get('https://example.com')

    # find_element raises NoSuchElementException when nothing matches, so
    # the whole sequence runs inside try/finally to guarantee the browser
    # is closed.

    # By ID
    element = driver.find_element(By.ID, 'element_id')

    # By name attribute
    element = driver.find_element(By.NAME, 'element_name')

    # By class name
    element = driver.find_element(By.CLASS_NAME, 'element_class')

    # By tag name
    element = driver.find_element(By.TAG_NAME, 'div')

    # By CSS selector
    element = driver.find_element(By.CSS_SELECTOR, '.class > div')

    # By XPath
    element = driver.find_element(By.XPATH, '//div[@class="example"]')

    # By exact link text
    element = driver.find_element(By.LINK_TEXT, '点击这里')

    # By partial link text
    element = driver.find_element(By.PARTIAL_LINK_TEXT, '点击')
finally:
    # Always close the browser, even if a lookup above failed
    driver.quit()

元素操作

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

driver = webdriver.Chrome()
try:
    driver.get('https://example.com')

    # Click a button
    button = driver.find_element(By.ID, 'submit')
    button.click()

    # Type text into an input field
    input_field = driver.find_element(By.ID, 'username')
    input_field.send_keys('张三')

    # Clear the input field
    input_field.clear()

    # Locate the element to read from. This lookup was missing, which left
    # `element` undefined; a link is used so that 'href' is meaningful.
    element = driver.find_element(By.TAG_NAME, 'a')

    # Read the element's visible text
    text = element.text

    # Read an attribute value
    href = element.get_attribute('href')

    # Choose an option in a <select> drop-down: by visible label, by the
    # option's value attribute, or by position
    select = Select(driver.find_element(By.ID, 'country'))
    select.select_by_visible_text('中国')
    select.select_by_value('CN')
    select.select_by_index(0)

    # Scroll to the bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
finally:
    # Always close the browser, even if a step above failed
    driver.quit()

等待机制

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
try:
    driver.get('https://example.com')

    # Explicit wait: poll for up to 10 s until the element is in the DOM.
    # Only TimeoutException is caught, so unrelated failures (a crashed
    # session, Ctrl+C) are not misreported as a timeout.
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'dynamic-element'))
        )
    except TimeoutException:
        print("等待超时")
    else:
        print("元素已加载")

    # Wait until the element is clickable
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, 'button'))
    )

    # Wait until the given text appears inside the element
    WebDriverWait(driver, 10).until(
        EC.text_to_be_present_in_element((By.ID, 'status'), '完成')
    )
finally:
    # Always close the browser, even if a wait above timed out
    driver.quit()

处理弹窗和 iframe

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    driver.get('https://example.com')

    # Handle an alert() dialog
    driver.find_element(By.ID, 'show-alert').click()
    alert = driver.switch_to.alert
    print(f"弹窗文本: {alert.text}")
    alert.accept()  # press OK
    # alert.dismiss()  # press Cancel

    # Handle a confirm() dialog
    driver.find_element(By.ID, 'show-confirm').click()
    confirm = driver.switch_to.alert
    confirm.dismiss()

    # Handle a prompt() dialog: type a value, then accept
    driver.find_element(By.ID, 'show-prompt').click()
    prompt = driver.switch_to.alert
    prompt.send_keys('输入内容')
    prompt.accept()

    # Switch into an iframe
    driver.switch_to.frame('iframe_id')
    # ... interact with elements inside the iframe here ...
    driver.switch_to.default_content()  # switch back to the main page
finally:
    # Always close the browser, even if a step above failed
    driver.quit()

无头模式

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Configure Chrome to run without a visible window (headless mode)
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get('https://example.com')
    print(driver.title)
finally:
    # Always close the browser, even if loading the page failed
    driver.quit()
💡 提示:Selenium 适合需要 JavaScript 渲染的动态网页,但速度较慢,资源占用大。
← BeautifulSoup | Playwright →