Fix article extraction for several news sites (央视/CCTV, 中国网, 法治日报, 工人日报, 农民日报, 解放军报, 旗帜网): add site-specific request headers, encoding fallbacks, extra title/content selectors, duplicate-title cleanup, and broader article-page detection in full_site_crawler.
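The per-site handling in the diff below follows one pattern throughout: match on website.name, apply site-specific request headers, then fall back through alternative encodings and content selectors when extraction fails. A minimal sketch of that pattern (SITE_HEADERS and fetch_html are illustrative names and a reduced header set, not identifiers from this commit):

import requests

# Illustrative sketch only - mirrors the per-site header/encoding approach in this commit,
# but these names and the reduced header set are assumptions, not code from core/utils.py.
SITE_HEADERS = {
    "央视": {"Referer": "https://news.cctv.com/"},
    "中国网": {"Referer": "http://www.china.com.cn/"},
    "法治日报": {"Referer": "http://www.legaldaily.com.cn/"},
}

def fetch_html(url, site_name, base_headers=None):
    headers = dict(base_headers or {})
    for key, extra in SITE_HEADERS.items():
        if key in site_name:
            headers.update(extra)  # site-specific headers, as added in the diff
            break
    resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
    resp.raise_for_status()
    resp.encoding = "utf-8"
    # crude fallback, as in the diff: very short or garbled pages get re-decoded
    if len(resp.text) < 1000:
        resp.encoding = "gbk"
    return resp.text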
core/utils.py
@@ -92,10 +92,85 @@ def process_article(url, website):
             "Upgrade-Insecure-Requests": "1",
             "Referer": "https://www.gmw.cn/"
         })
+    # 添加央视网的特殊请求头
+    elif "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://news.cctv.com/"
+        })
+    # 添加中国网的特殊请求头
+    elif "中国网" in website.name or "china.com.cn" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "http://www.china.com.cn/"
+        })
+    # 添加法治日报的特殊请求头
+    elif "法治日报" in website.name or "legaldaily" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "http://www.legaldaily.com.cn/"
+        })
+    # 添加工人日报的特殊请求头
+    elif "工人日报" in website.name or "workercn" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.workercn.cn/"
+        })
+    # 添加农民日报的特殊请求头
+    elif "农民日报" in website.name or "farmer" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.farmer.com.cn/"
+        })
+    # 添加解放军报的特殊请求头
+    elif "解放军报" in website.name or "81.cn" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "http://www.81.cn/"
+        })
+    # 添加旗帜网的特殊请求头
+    elif "旗帜网" in website.name or "qizhiwang" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "http://www.qizhiwang.org.cn/"
+        })
+
     try:
-        resp = requests.get(url, headers=headers, timeout=15)
+        resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
         resp.raise_for_status()

+        # 检查是否是重定向页面
+        if len(resp.text) < 500:
+            print(f"页面内容过短,可能是重定向页面:{url}")
+            return
     except Exception as e:
         print(f"请求失败:{url},错误:{e}")
         return
@@ -107,6 +182,14 @@ def process_article(url, website):
         resp.encoding = 'utf-8'
     elif "央视" in website.name or "CCTV" in website.name:
         resp.encoding = 'utf-8'
+    elif "农民日报" in website.name or "farmer" in website.name:
+        resp.encoding = 'utf-8'
+        # 尝试其他编码
+        if '' in resp.text or len(resp.text) < 1000:
+            resp.encoding = 'gbk'
+            # 进一步尝试其他编码
+            if '' in resp.text or '' in resp.text:
+                resp.encoding = 'gb2312'
     else:
         resp.encoding = 'utf-8'

@@ -146,7 +229,7 @@ def process_article(url, website):
         if not content_tag:
             content_tag = soup.find("div", id="ContentText")
     elif website.name == "www.gov.cn":
-        # 中国政府网的文章结构处理 - 修复两个标题问题
+        # 中国政府网的文章结构处理 - 修复标题重复问题
         title_tag = (
             soup.find("h1", class_="title") or
             soup.find("h1") or
@@ -160,6 +243,38 @@ def process_article(url, website):
             soup.find("div", id="content") or
             soup.find("div", class_="mainBody")
         )
+
+        # 针对中国政府网的特殊处理,清理内容中的重复标题
+        if content_tag and title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if title_text:
+                # 移除内容中的重复标题元素
+                for heading in content_tag.find_all(["h1", "h2", "h3"]):
+                    heading_text = heading.get_text(strip=True)
+                    if title_text in heading_text or heading_text in title_text:
+                        heading.decompose()
+
+                # 移除class包含title的元素
+                for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
+                    title_element_text = title_element.get_text(strip=True)
+                    if title_text in title_element_text or title_element_text in title_text:
+                        title_element.decompose()
+
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     elif "人民日报" in website.name or "人民网" in website.name:
         # 人民日报网站的文章结构处理 - 修复乱码和404问题
         title_tag = (
@@ -220,21 +335,50 @@ def process_article(url, website):
             # 移除相关专题列表
             for topic_element in content_tag.find_all("div", class_="clearfix text_like"):
                 topic_element.decompose()
-    elif "央视" in website.name or "CCTV" in website.name:
-        # 央视网站的文章结构处理 - 修复视频下载问题
+    elif "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
+        # 央视网站的文章结构处理 - 修复不保存正文和图片问题
         title_tag = (
             soup.find("h1", class_="title") or
+            soup.find("h1", class_="title_text") or # 央视网特有标题类
             soup.find("h1") or
             soup.find("title")
         )
         content_tag = (
+            soup.find("div", class_="content_area") or # 央视网特有内容容器
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
-            soup.find("div", class_="article")
+            soup.find("div", class_="article") or
+            soup.find("div", class_="article-body")
         )
+
+        # 针对央视网的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除评论区域
+            for comment_element in content_tag.find_all("div", class_="comment"):
+                comment_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     elif "求是" in website.name:
         # 求是网站的文章结构处理 - 修复标题和正文清理问题
         title_tag = (
@@ -246,6 +390,16 @@ def process_article(url, website):
             soup.find("title")
         )

+        # 针对求是的特殊处理,如果标题为空或太短,尝试从title标签提取
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if not title_text or len(title_text) < 5:
+                title_tag = soup.find("title")
+
+        # 针对求是的特殊处理,确保标题被正确提取
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -260,14 +414,31 @@ def process_article(url, website):
         if title_tag:
             title_text = title_tag.get_text(strip=True)
             if title_text:
+                # 移除所有可能的重复标题元素
                 for strong_tag in content_tag.find_all("strong"):
                     strong_text = strong_tag.get_text(strip=True)
-                    if title_text in strong_text or strong_text in title_text:
+                    if strong_text and (title_text in strong_text or strong_text in title_text):
                         parent_p = strong_tag.find_parent("p")
                         # 如果 strong 在正文前两段内,就删除
                         if parent_p in content_tag.find_all("p")[:2]:
                             strong_tag.decompose()
+
+                # 移除h1、h2、h3标题元素中的重复标题
+                for heading in content_tag.find_all(["h1", "h2", "h3"]):
+                    heading_text = heading.get_text(strip=True)
+                    if heading_text and (title_text in heading_text or heading_text in title_text):
+                        # 确保不删除title_tag本身
+                        if heading != title_tag:
+                            heading.decompose()
+
+                # 移除class包含title的元素
+                for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
+                    title_element_text = title_element.get_text(strip=True)
+                    if title_element_text and (title_text in title_element_text or title_element_text in title_text):
+                        # 确保不删除title_tag本身
+                        if title_element != title_tag:
+                            title_element.decompose()

                 # 移除分享相关元素
                 for share_element in content_tag.find_all("div", class_="sharebox"):
                     share_element.decompose()
@@ -304,20 +475,84 @@ def process_article(url, website):
             for highlight_element in content_tag.find_all("div", class_="highlight"):
                 highlight_element.unwrap()
     elif "解放军报" in website.name or "81.cn" in website.name:
-        # 解放军报的文章结构处理 - 修复类别爬取问题
+        # 解放军报的文章结构处理 - 修复有视频的文章不被爬取问题
         title_tag = (
             soup.find("h1", class_="title") or
+            soup.find("h1", class_="article-title") or # 解放军报特有标题类
             soup.find("h1") or
+            soup.find("h2") or # 解放军报使用h2标签作为标题
             soup.find("title")
         )
+
+        # 针对解放军报的特殊处理,如果标题为空或太短,尝试从title标签提取
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if not title_text or len(title_text) < 5:
+                title_tag = soup.find("title")
         content_tag = (
-            soup.find("div", class_="content") or
-            soup.find("div", class_="article-content") or
-            soup.find("div", id="content") or
+            soup.find("div", id="article-content") or # 解放军报实际文章内容容器
             soup.find("div", class_="text") or
+            soup.find("div", class_="article-content") or
+            soup.find("div", class_="content") or
+            soup.find("div", id="content") or
             soup.find("div", class_="main-content") or
-            soup.find("div", class_="article")
+            soup.find("div", class_="article") or
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="artichle-info") # 作为备选
         )
+
+        # 针对解放军报的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除面包屑导航
+            for breadcrumb in content_tag.find_all("ol", class_="breadcrumb"):
+                breadcrumb.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share-custom"):
+                share_element.decompose()
+
+            # 移除作者信息段落
+            for author_p in content_tag.find_all("p"):
+                text = author_p.get_text(strip=True)
+                if "来源:" in text or "作者:" in text or "责任编辑:" in text or "发布:" in text:
+                    author_p.decompose()
+
+            # 移除进度条
+            for progress in content_tag.find_all("div", class_="progress-bar"):
+                progress.decompose()
+
+            # 移除播放器
+            for player in content_tag.find_all("div", class_="player"):
+                player.decompose()
+
+            # 移除媒体URL容器
+            for media in content_tag.find_all("div", id="mediaurl"):
+                media.decompose()
+
+            # 移除新闻列表(但保留其中的内容)
+            for news_list in content_tag.find_all("ul", id="main-news-list"):
+                # 不删除整个ul,而是unwrap它,保留其中的内容
+                news_list.unwrap()
+
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     elif "光明日报" in website.name or "gmw.cn" in website.name:
         # 光明日报的文章结构处理 - 修复不保存文章内容问题
         title_tag = (
@@ -402,21 +637,57 @@ def process_article(url, website):
             # 更新content_tag为包含所有分页内容
             content_tag = BeautifulSoup(all_content_html, "html.parser")
     elif "工人日报" in website.name or "workercn" in website.name:
-        # 工人日报的文章结构处理 - 修复不保存文章内容问题
+        # 工人日报的文章结构处理 - 修复不保存正文和图片问题
         title_tag = (
             soup.find("h1", class_="title") or
+            soup.find("h1", class_="article-title") or # 工人日报特有标题类
             soup.find("h1") or
             soup.find("title")
         )
+
+        # 针对工人日报的特殊处理,如果标题为空或太短,尝试从title标签提取
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if not title_text or len(title_text) < 5:
+                title_tag = soup.find("title")
+
+        # 进一步处理:如果h1标题包含太多无关信息,尝试从title标签提取更简洁的标题
+        if title_tag and title_tag.name == 'h1':
+            title_text = title_tag.get_text(strip=True)
+            if title_text and len(title_text) > 50: # 如果h1标题太长
+                title_tag = soup.find("title")
         content_tag = (
-            soup.find("div", class_="content") or
+            soup.find("div", class_="ccontent") or # 工人日报特有内容容器
             soup.find("div", class_="article-content") or
+            soup.find("div", class_="content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
+
+        # 针对工人日报的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     elif "科技日报" in website.name or "stdaily" in website.name:
         # 科技日报的文章结构处理 - 修复无法爬取问题
         title_tag = (
@@ -602,16 +873,17 @@ def process_article(url, website):
             soup.find("div", class_="article-body")
         )
     elif "法治日报" in website.name or "legaldaily" in website.name:
-        # 法治日报的文章结构处理 - 修复无法爬取问题
+        # 法治日报的文章结构处理 - 修复不保存正文和图片问题
         title_tag = (
             soup.find("h1", class_="title") or
+            soup.find("h1", class_="article-title") or # 法治日报特有标题类
             soup.find("h1") or
             soup.find("title")
         )
         content_tag = (
             soup.find("div", class_="content-two") or # 优先查找content-two类
+            soup.find("div", class_="article-content") or # 法治日报特有内容容器
             soup.find("div", class_="content") or
-            soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
@@ -619,30 +891,91 @@ def process_article(url, website):
             soup.find("div", class_="article-body")
         )

-        # 如果找到content-two,需要进一步处理去除内部的标题元素(避免重复)
-        if content_tag and content_tag.get('class') and 'content-two' in content_tag.get('class', []):
-            # 查找并移除内容中的标题元素(避免重复)
-            inner_titles = content_tag.find_all(['h1', 'h2'])
-            title_text = title_tag.get_text(strip=True) if title_tag else "无标题"
-            for inner_title in inner_titles:
-                if inner_title.get_text().strip() == title_text:
-                    inner_title.decompose()
+        # 针对法治日报的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 如果找到content-two,需要进一步处理去除内部的标题元素(避免重复)
+            if content_tag.get('class') and 'content-two' in content_tag.get('class', []):
+                # 查找并移除内容中的标题元素(避免重复)
+                inner_titles = content_tag.find_all(['h1', 'h2'])
+                title_text = title_tag.get_text(strip=True) if title_tag else "无标题"
+                for inner_title in inner_titles:
+                    if inner_title.get_text().strip() == title_text:
+                        inner_title.decompose()
+
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     elif "农民日报" in website.name or "farmer" in website.name:
-        # 农民日报的文章结构处理 - 修复正文未被爬取问题
+        # 农民日报的文章结构处理 - 修复不保存正文和图片问题
         title_tag = (
             soup.find("h1", class_="title") or
+            soup.find("h1", class_="article-title") or # 农民日报特有标题类
             soup.find("h1") or
             soup.find("title")
         )
+
+        # 针对农民日报的特殊处理,如果标题出现乱码,尝试从title标签提取
+        if title_tag and title_tag.name == 'h1':
+            title_text = title_tag.get_text(strip=True)
+            if title_text and any(char in title_text for char in ['', '', '']):
+                title_tag = soup.find("title")
         content_tag = (
-            soup.find("div", class_="content") or
+            soup.find("div", class_="detailCon") or # 农民日报特有内容容器
             soup.find("div", class_="article-content") or
+            soup.find("div", class_="content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
+
+        # 针对农民日报的特殊处理,如果找到多个detailCon,选择内容最长的那个
+        if content_tag and content_tag.get('class') and 'detailCon' in content_tag.get('class', []):
+            detail_cons = soup.find_all("div", class_="detailCon")
+            if len(detail_cons) > 1:
+                # 选择内容最长的detailCon
+                longest_content = max(detail_cons, key=lambda x: len(x.get_text(strip=True)))
+                if len(longest_content.get_text(strip=True)) > len(content_tag.get_text(strip=True)):
+                    content_tag = longest_content
+
+        # 针对农民日报的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     elif "学习强国" in website.name or "xuexi" in website.name:
         # 学习强国的文章结构处理 - 修复无法爬取问题
         title_tag = (
@@ -660,18 +993,19 @@ def process_article(url, website):
             soup.find("div", class_="article-body")
         )
     elif "旗帜网" in website.name or "qizhiwang" in website.name:
-        # 旗帜网的文章结构处理 - 修复不保存文章内容问题
+        # 旗帜网的文章结构处理 - 修复不保存正文和图片问题
         title_tag = (
             soup.find("div", class_="w1200 flag-text-tit clearfix") and
             soup.find("div", class_="w1200 flag-text-tit clearfix").find("h1") or
             soup.find("h1", class_="title") or
+            soup.find("h1", class_="article-title") or # 旗帜网特有标题类
             soup.find("h1") or
             soup.find("title")
         )
         content_tag = (
             soup.find("div", class_="w1200 flag-text-con clearfix") or # 旗帜网特有内容容器
+            soup.find("div", class_="article-content") or # 旗帜网特有内容容器
             soup.find("div", class_="content") or
-            soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
@@ -679,27 +1013,99 @@ def process_article(url, website):
             soup.find("div", class_="article-body")
         )

+        # 针对旗帜网的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
+
+        # 针对旗帜网的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
+
     elif "中国网" in website.name or "china.com.cn" in website.name:
-        # 中国网的文章结构处理 - 修复不保存文章内容问题
+        # 中国网的文章结构处理 - 修复不保存正文和图片问题
         title_tag = (
             soup.find("h1", class_="title") or
+            soup.find("h1", class_="article-title") or # 中国网特有标题类
             soup.find("h1") or
             soup.find("title")
         )
         content_tag = (
-            soup.find("div", class_="content") or
+            soup.find("div", class_="main") or # 中国网特有内容容器
             soup.find("div", class_="article-content") or
+            soup.find("div", class_="content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )

+        # 针对中国网的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     else:
         # 默认处理方式
         title_tag = soup.find("h1") or soup.find("title")
         content_tag = soup.find("div", class_="content") or soup.find("div", id="content")
+
+    # 最终标题处理 - 只有在没有网站特定处理时才使用默认处理
+    if not title_tag:
+        title_tag = soup.find("h1") or soup.find("title")
+
     title = title_tag.get_text(strip=True) if title_tag else "无标题"

     # 对标题进行额外处理,去除可能的多余空白字符
@@ -776,7 +1182,7 @@ def process_article(url, website):
             src = urljoin(url, src)

             # 针对央视网等特殊处理
-            if "央视" in website.name or "CCTV" in website.name:
+            if "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
                 # 央视网视频可能需要特殊处理
                 if "cctv.com" in src or "cntv.cn" in src:
                     print(f"发现央视视频: {src}")
@@ -785,6 +1191,13 @@ def process_article(url, website):
             elif "新华网" in website.name:
                 print(f"发现新华网视频: {src}")
+
+            # 针对解放军报的特殊处理
+            elif "解放军报" in website.name or "81.cn" in website.name:
+                print(f"发现解放军报视频: {src}")
+                # 解放军报视频可能需要特殊处理
+                if "81.cn" in src:
+                    print(f"处理解放军报视频: {src}")

             local_path = download_media(src, save_dir)
             if local_path:
                 rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
@@ -920,11 +1333,13 @@ def full_site_crawler(start_url, website, max_pages=1000):
                 # 增加对peopleapp.com特定文章路径的判断
                 ("/dynamic/" in path and "article" in path)
             )
-        elif "央视" in website.name or "CCTV" in website.name:
-            # 央视网站的文章页面判断逻辑 - 修复视频下载问题
+        elif "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
+            # 央视网站的文章页面判断逻辑 - 修复不保存正文和图片问题
             parsed_url = urlparse(url)
             path = parsed_url.path
             is_article_page = (
+                (soup.find("div", class_="content_area") is not None and
+                 soup.find("h1") is not None) or # 央视网特有内容容器
                 (soup.find("div", class_="content") is not None and
                  soup.find("h1") is not None) or
                 soup.find("div", class_="article-content") is not None or
@@ -933,9 +1348,11 @@ def full_site_crawler(start_url, website, max_pages=1000):
                 soup.find("div", class_="text") is not None or
                 soup.find("div", class_="main-content") is not None or
                 soup.find("div", class_="article") is not None or
+                soup.find("div", class_="article-body") is not None or
                 ("/news/" in path) or
                 ("/article/" in path) or
-                (path.startswith("/detail/") and len(path) > 10)
+                (path.startswith("/detail/") and len(path) > 10) or
+                ("ARTI" in path) # 央视网文章URL特征
             )
         elif "求是" in website.name:
             # 求是网站的文章页面判断逻辑 - 修复两个标题问题
@@ -950,22 +1367,31 @@ def full_site_crawler(start_url, website, max_pages=1000):
                 (path.startswith("/detail/") and len(path) > 10)
             )
         elif "解放军报" in website.name or "81.cn" in website.name:
-            # 解放军报的文章页面判断逻辑 - 修复类别爬取问题
+            # 解放军报的文章页面判断逻辑 - 修复有视频的文章不被爬取问题
             parsed_url = urlparse(url)
             path = parsed_url.path
             is_article_page = (
+                (soup.find("div", class_="artichle-info") is not None and
+                 soup.find("title") is not None) or # 解放军报特有内容容器
+                (soup.find("div", class_="article-content") is not None and
+                 soup.find("h1") is not None) or
                 (soup.find("div", class_="content") is not None and
                  soup.find("h1") is not None) or
-                soup.find("div", class_="article-content") is not None or
                 (soup.find("div", id="content") is not None and
                  soup.find("h1") is not None) or
                 soup.find("div", class_="text") is not None or
                 soup.find("div", class_="main-content") is not None or
                 soup.find("div", class_="article") is not None or
+                soup.find("div", class_="article-body") is not None or
+                ("/zt/" in path) or # 解放军报专题栏目
                 ("/article/" in path) or
                 ("/content/" in path) or
                 (path.startswith("/detail/") and len(path) > 10)
             )
+
+            # 排除列表页面
+            if "/index.html" in path or path.endswith("/"):
+                is_article_page = False
         elif "光明日报" in website.name or "gmw.cn" in website.name:
             # 光明日报的文章页面判断逻辑 - 修复不保存文章内容问题
             parsed_url = urlparse(url)
@@ -1022,19 +1448,23 @@ def full_site_crawler(start_url, website, max_pages=1000):
                 (path.startswith("/detail/") and len(path) > 10)
             )
         elif "工人日报" in website.name or "workercn" in website.name:
-            # 工人日报的文章页面判断逻辑 - 修复不保存文章内容问题
+            # 工人日报的文章页面判断逻辑 - 修复不保存正文和图片问题
             parsed_url = urlparse(url)
             path = parsed_url.path
             is_article_page = (
+                (soup.find("div", class_="ccontent") is not None and
+                 soup.find("h1") is not None) or # 工人日报特有内容容器
+                (soup.find("div", class_="article-content") is not None and
+                 soup.find("h1") is not None) or
                 (soup.find("div", class_="content") is not None and
                  soup.find("h1") is not None) or
-                soup.find("div", class_="article-content") is not None or
                 (soup.find("div", id="content") is not None and
                  soup.find("h1") is not None) or
                 soup.find("div", class_="text") is not None or
                 soup.find("div", class_="main-content") is not None or
                 soup.find("div", class_="article") is not None or
                 soup.find("div", class_="article-body") is not None or
+                ("/c/" in path) or # 工人日报文章URL特征
                 ("/article/" in path) or
                 ("/content/" in path) or
                 (path.startswith("/detail/") and len(path) > 10)
@@ -1168,13 +1598,16 @@ def full_site_crawler(start_url, website, max_pages=1000):
                 (path.startswith("/detail/") and len(path) > 10)
             )
         elif "法治日报" in website.name or "legaldaily" in website.name:
-            # 法治日报的文章页面判断逻辑 - 修复无法爬取问题
+            # 法治日报的文章页面判断逻辑 - 修复不保存正文和图片问题
             parsed_url = urlparse(url)
             path = parsed_url.path
             is_article_page = (
+                (soup.find("div", class_="content-two") is not None and
+                 soup.find("h1") is not None) or # 法治日报特有内容容器
+                (soup.find("div", class_="article-content") is not None and
+                 soup.find("h1") is not None) or
                 (soup.find("div", class_="content") is not None and
                  soup.find("h1") is not None) or
-                soup.find("div", class_="article-content") is not None or
                 (soup.find("div", id="content") is not None and
                  soup.find("h1") is not None) or
                 soup.find("div", class_="text") is not None or
@@ -1187,13 +1620,16 @@ def full_site_crawler(start_url, website, max_pages=1000):
                 (path.startswith("/detail/") and len(path) > 10)
             )
         elif "农民日报" in website.name or "farmer" in website.name:
-            # 农民日报的文章页面判断逻辑 - 修复正文未被爬取问题
+            # 农民日报的文章页面判断逻辑 - 修复不保存正文和图片问题
            parsed_url = urlparse(url)
             path = parsed_url.path
             is_article_page = (
+                (soup.find("div", class_="detailCon") is not None and
+                 soup.find("h1") is not None) or # 农民日报特有内容容器
+                (soup.find("div", class_="article-content") is not None and
+                 soup.find("h1") is not None) or
                 (soup.find("div", class_="content") is not None and
                  soup.find("h1") is not None) or
-                soup.find("div", class_="article-content") is not None or
                 (soup.find("div", id="content") is not None and
                  soup.find("h1") is not None) or
                 soup.find("div", class_="text") is not None or
@@ -1223,30 +1659,39 @@ def full_site_crawler(start_url, website, max_pages=1000):
                 (path.startswith("/detail/") and len(path) > 10)
             )
         elif "旗帜网" in website.name or "qizhiwang" in website.name:
-            # 旗帜网的文章页面判断逻辑 - 修复不保存文章内容问题
+            # 旗帜网的文章页面判断逻辑 - 修复不保存正文和图片问题
             parsed_url = urlparse(url)
             path = parsed_url.path
             is_article_page = (
+                (soup.find("div", class_="w1200 flag-text-con clearfix") is not None and
+                 soup.find("h1") is not None) or # 旗帜网特有内容容器
+                (soup.find("div", class_="article-content") is not None and
+                 soup.find("h1") is not None) or
                 (soup.find("div", class_="content") is not None and
                  soup.find("h1") is not None) or
-                soup.find("div", class_="article-content") is not None or
                 (soup.find("div", id="content") is not None and
                  soup.find("h1") is not None) or
                 soup.find("div", class_="main-content") is not None or
                 soup.find("div", class_="article") is not None or
                 soup.find("div", class_="article-body") is not None or
+                ("/n1/" in path) or # 旗帜网文章URL特征
                 ("/article/" in path) or
                 ("/content/" in path) or
-                (path.startswith("/detail/") and len(path) > 10)
+                (path.startswith("/detail/") and len(path) > 10) or
+                # 简化判断:只要有h1标题就认为是文章页面
+                soup.find("h1") is not None
             )
         elif "中国网" in website.name or "china.com.cn" in website.name:
-            # 中国网的文章页面判断逻辑 - 修复不保存文章内容问题
+            # 中国网的文章页面判断逻辑 - 修复不保存正文和图片问题
             parsed_url = urlparse(url)
             path = parsed_url.path
             is_article_page = (
+                (soup.find("div", class_="main") is not None and
+                 soup.find("h1") is not None) or # 中国网特有内容容器
+                (soup.find("div", class_="article-content") is not None and
+                 soup.find("h1") is not None) or
                 (soup.find("div", class_="content") is not None and
                  soup.find("h1") is not None) or
-                soup.find("div", class_="article-content") is not None or
                 (soup.find("div", id="content") is not None and
                  soup.find("h1") is not None) or
                 soup.find("div", class_="text") is not None or
@@ -1255,6 +1700,7 @@ def full_site_crawler(start_url, website, max_pages=1000):
                 soup.find("div", class_="article-body") is not None or
                 ("/article/" in path) or
                 ("/content/" in path) or
+                ("/opinion/" in path) or # 中国网观点栏目
                 (path.startswith("/detail/") and len(path) > 10)
             )
         else: