Fix Qiushi (求是) article parsing bug: title detection and body cleanup
This commit is contained in:
@@ -236,12 +236,16 @@ def process_article(url, website):
|
|||||||
soup.find("div", class_="article")
|
soup.find("div", class_="article")
|
||||||
)
|
)
|
||||||
elif "求是" in website.name:
|
elif "求是" in website.name:
|
||||||
# 求是网站的文章结构处理 - 修复两个标题问题
|
# 求是网站的文章结构处理 - 修复标题和正文清理问题
|
||||||
title_tag = (
|
title_tag = (
|
||||||
soup.find("h1", class_="title") or
|
soup.find("h1", class_="title") or
|
||||||
|
soup.find("h2", class_="title") or
|
||||||
soup.find("h1") or
|
soup.find("h1") or
|
||||||
|
soup.find("h2") or
|
||||||
|
soup.find("p", class_="title") or
|
||||||
soup.find("title")
|
soup.find("title")
|
||||||
)
|
)
|
||||||
|
|
||||||
content_tag = (
|
content_tag = (
|
||||||
soup.find("div", class_="content") or
|
soup.find("div", class_="content") or
|
||||||
soup.find("div", class_="article-content") or
|
soup.find("div", class_="article-content") or
|
||||||
@@ -251,20 +255,19 @@ def process_article(url, website):
|
|||||||
soup.find("div", class_="article")
|
soup.find("div", class_="article")
|
||||||
)
|
)
|
||||||
|
|
||||||
# 针对求是网的特殊处理,清理内容中的重复标题和无关元素
|
|
||||||
if content_tag:
|
if content_tag:
|
||||||
# 移除重复标题:查找与文章标题相同的strong标签并移除
|
# 移除重复标题(放宽匹配条件,允许部分匹配)
|
||||||
if title_tag:
|
if title_tag:
|
||||||
title_text = title_tag.get_text(strip=True)
|
title_text = title_tag.get_text(strip=True)
|
||||||
# 查找内容中与标题相同的strong标签(通常出现在正文第一段)
|
if title_text:
|
||||||
for strong_tag in content_tag.find_all("strong"):
|
for strong_tag in content_tag.find_all("strong"):
|
||||||
if strong_tag.get_text().strip() == title_text:
|
strong_text = strong_tag.get_text(strip=True)
|
||||||
# 检查是否是正文第一段中的重复标题
|
if title_text in strong_text or strong_text in title_text:
|
||||||
parent_p = strong_tag.find_parent("p")
|
parent_p = strong_tag.find_parent("p")
|
||||||
if parent_p and parent_p == content_tag.find("p"):
|
# 如果 strong 在正文前两段内,就删除
|
||||||
|
if parent_p in content_tag.find_all("p")[:2]:
|
||||||
strong_tag.decompose()
|
strong_tag.decompose()
|
||||||
|
|
||||||
# 移除无关的元素
|
|
||||||
# 移除分享相关元素
|
# 移除分享相关元素
|
||||||
for share_element in content_tag.find_all("div", class_="sharebox"):
|
for share_element in content_tag.find_all("div", class_="sharebox"):
|
||||||
share_element.decompose()
|
share_element.decompose()
|
||||||
@@ -275,8 +278,8 @@ def process_article(url, website):
|
|||||||
|
|
||||||
# 移除编辑信息
|
# 移除编辑信息
|
||||||
for editor_element in content_tag.find_all("div", class_="fs-text"):
|
for editor_element in content_tag.find_all("div", class_="fs-text"):
|
||||||
if editor_element.get_text() and (
|
text = editor_element.get_text(strip=True)
|
||||||
"网站编辑" in editor_element.get_text() or "审核" in editor_element.get_text()):
|
if text and ("网站编辑" in text or "审核" in text):
|
||||||
editor_element.decompose()
|
editor_element.decompose()
|
||||||
|
|
||||||
# 移除声明链接
|
# 移除声明链接
|
||||||
@@ -290,18 +293,16 @@ def process_article(url, website):
|
|||||||
# 移除分隔线
|
# 移除分隔线
|
||||||
for line_element in content_tag.find_all("div", class_="fs-line"):
|
for line_element in content_tag.find_all("div", class_="fs-line"):
|
||||||
line_element.decompose()
|
line_element.decompose()
|
||||||
|
|
||||||
for line_element in content_tag.find_all("div", class_="fs-line_b"):
|
for line_element in content_tag.find_all("div", class_="fs-line_b"):
|
||||||
line_element.decompose()
|
line_element.decompose()
|
||||||
|
|
||||||
# 移除剪贴板相关元素
|
# unwrap 剪贴板相关元素(保留文字,去掉外层标签)
|
||||||
for clipboard_element in content_tag.find_all("div", class_="clipboard_text"):
|
for clipboard_element in content_tag.find_all("div", class_="clipboard_text"):
|
||||||
clipboard_element.unwrap() # unwrap只移除标签,保留内容
|
clipboard_element.unwrap()
|
||||||
|
|
||||||
# 移除highlight包装层,保留内容
|
# unwrap highlight 包装层(保留文字)
|
||||||
for highlight_element in content_tag.find_all("div", class_="highlight"):
|
for highlight_element in content_tag.find_all("div", class_="highlight"):
|
||||||
highlight_element.unwrap() # unwrap只移除标签,保留内容
|
highlight_element.unwrap()
|
||||||
|
|
||||||
elif "解放军报" in website.name or "81.cn" in website.name:
|
elif "解放军报" in website.name or "81.cn" in website.name:
|
||||||
# 解放军报的文章结构处理 - 修复类别爬取问题
|
# 解放军报的文章结构处理 - 修复类别爬取问题
|
||||||
title_tag = (
|
title_tag = (
|
||||||
|
|||||||
Reference in New Issue
Block a user