Step 04:用 XPath 解析靜態 HTML(核心練習)(Python)
from urllib.parse import urljoin

import requests
from lxml import html

# Approach: render the JavaScript first (requests-html or Selenium), then
# query the resulting markup with XPath.
# This example works on HTML that has already been rendered, e.g. taken
# from Selenium. `page_source` is expected to hold that rendered HTML string
# and must be defined before this script runs:
# page_source = driver.page_source  # Selenium

# Site root used to turn the thread hrefs into absolute URLs.
BASE_URL = "https://lihkg.com"


def _first_text(values, default=""):
    """Return the first XPath string result, stripped, or *default* if empty."""
    return values[0].strip() if values else default


# Parse the HTML.
tree = html.fromstring(page_source)

# ── Extract every thread with XPath ──
# The class names are matched with contains() rather than by equality.
threads = tree.xpath('//div[contains(@class, "wQ4Ran")]')

results = []
for thread in threads:
    # User name.
    username = thread.xpath(
        './/span[contains(@class, "CxY4XDSSI")]/text()'
    )

    # Time the thread was posted.
    # NOTE(review): a positional predicate after `.//span[...]` is evaluated
    # per parent element, so `[1]` / `[2]` here rely on both spans being
    # siblings. Confirm against the real markup; `(.//span[...])[2]` would
    # select by document order instead.
    time_posted = thread.xpath(
        './/span[contains(@class, "_37XwjAqV")][1]/text()'
    )

    # Like count. The likes span also contains an <i> icon, so //text() is
    # used to collect every text node below it before joining.
    likes_parts = thread.xpath(
        './/span[contains(@class, "_37XwjAqV")][2]//text()'
    )
    likes = ''.join(likes_parts).strip()

    # Page count. text()[1] takes only the first direct text node, which
    # skips the content of the nested <select>.
    pages = thread.xpath(
        './/div[contains(@class, "_26oEXjfU")]/text()[1]'
    )

    # Thread title.
    title = thread.xpath(
        './/span[contains(@class, "_20jopXBF")]/text()'
    )

    # Category (channel) name.
    category = thread.xpath(
        './/a[contains(@class, "_3VRxq3mC")]/text()'
    )

    # Thread URL (href attribute of the thread link).
    thread_url = thread.xpath(
        './/a[contains(@class, "_2A_7bGY9")]/@href'
    )

    results.append({
        "username": _first_text(username),
        "time": _first_text(time_posted),
        "likes": likes,
        # A thread without a page indicator is reported as a single page.
        "pages": _first_text(pages, "1 頁"),
        "title": _first_text(title),
        "category": _first_text(category),
        # urljoin keeps the result valid whether the href is site-relative
        # ("/thread/..."), lacks the leading slash, or is already absolute.
        "thread_url": urljoin(BASE_URL, thread_url[0]) if thread_url else "",
    })

print(f"提取了 {len(results)} 條帖子")
# Show the first few records as a quick sanity check.
for r in results[:3]:
    print(r)
💡 contains(@class, ...) 是關鍵技巧——LIHKG 的 class 名稱是經過雜湊(hashed)處理的,所以要用 contains 做部分匹配,而非完全匹配。