deploy test
@@ -9,7 +9,7 @@ class Command(BaseCommand):

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
-                            choices=['cctv', 'cctvnews', 'all'],
+                            choices=['cctvnews', 'all'],
                             help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), all(全部)')

     def handle(self, *args, **options):
@@ -4,12 +4,12 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"
+    help = "全站递归爬取 中国网主网"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
                             choices=['china', 'all'],
-                            help='选择爬取平台: china(中国网主网), province(中国网一省份), all(全部)')
+                            help='选择爬取平台: china(中国网主网), all(全部)')

     def handle(self, *args, **options):
         platform = options['platform']
@@ -4,11 +4,11 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
-                            choices=['chinadaily', 'mobile', 'all'],
+                            choices=['chinadaily', 'all'],
                             help='选择爬取平台: chinadaily(中国日报), all(全部)')

     def handle(self, *args, **options):
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国新闻社及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国新闻社平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 法治日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 法治日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -5,7 +5,7 @@ from core.utils import full_site_crawler

 # jimmy.fang-20250815: 取消对光明日报的支持,光明日报反爬,被阻挡
 class Command(BaseCommand):
-    help = "全站递归爬取 光明日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 光明日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 工人日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 工人日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 经济日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 经济日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -5,7 +5,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 科技日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 科技日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 解放军报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 解放军报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 求是杂志及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 求是杂志平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 旗帜网及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 旗帜网平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 人民政协网及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 人民政协网平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 新华社及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 新华社平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 学习强国中央媒体学习号及省级以上学习平台"
+    help = "全站递归爬取 学习强国平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 学习时报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 学习时报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国妇女报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国妇女报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国纪检监察报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国纪检监察报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国青年报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国青年报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
core/utils.py (425 changed lines)
@@ -54,6 +54,28 @@ def get_page_with_selenium(url, website_name):
         wait_time = 15  # 学习强国需要更长时间
     elif "法治日报" in website_name:
         wait_time = 12  # 法治日报需要较长时间
+    elif "中国新闻社" in website_name or "chinanews" in website_name:
+        wait_time = 12  # 中国新闻社需要较长时间
+    elif "中国政府网" in website_name or "gov.cn" in website_name:
+        wait_time = 12  # 中国政府网需要较长时间
+    elif "工人日报" in website_name or "workercn" in website_name:
+        wait_time = 12  # 工人日报需要较长时间
+    elif "经济日报" in website_name or "ce.cn" in website_name:
+        wait_time = 12  # 经济日报需要较长时间
+    elif "求是" in website_name or "qstheory" in website_name:
+        wait_time = 12  # 求是网需要较长时间
+    elif "旗帜网" in website_name or "qizhiwang" in website_name:
+        wait_time = 12  # 旗帜网需要较长时间
+    elif "人民日报" in website_name or "people" in website_name:
+        wait_time = 12  # 人民日报需要较长时间
+    elif "人民政协网" in website_name or "rmzxw" in website_name:
+        wait_time = 12  # 人民政协网需要较长时间
+    elif "学习时报" in website_name or "studytimes" in website_name:
+        wait_time = 12  # 学习时报需要较长时间
+    elif "中国妇女报" in website_name or "cnwomen" in website_name:
+        wait_time = 12  # 中国妇女报需要较长时间
+    elif "中国青年报" in website_name or "cyol" in website_name:
+        wait_time = 12  # 中国青年报需要较长时间

     # 等待页面主要内容加载
     try:
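Note: every site added in this hunk waits the same 12 seconds, so the elif chain really encodes a keyword-to-seconds table. A table-driven sketch, not part of this commit (the 10-second default and the function name are assumptions):

# 假设的写法,非本提交内容:用映射表代替elif链
SITE_WAIT_TIMES = {
    "学习强国": 15, "xuexi": 15,
    "法治日报": 12, "chinanews": 12, "gov.cn": 12, "workercn": 12,
    "ce.cn": 12, "qstheory": 12, "qizhiwang": 12, "people": 12,
    "rmzxw": 12, "studytimes": 12, "cnwomen": 12, "cyol": 12,
}

def lookup_wait_time(website_name, default=10):
    # 命中任意关键词即返回对应秒数,否则返回假设的默认值
    for keyword, seconds in SITE_WAIT_TIMES.items():
        if keyword in website_name:
            return seconds
    return default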
@@ -234,10 +256,90 @@ def process_article(url, website):
             "Upgrade-Insecure-Requests": "1",
             "Referer": "http://www.qizhiwang.org.cn/"
         })
+    # 添加中国新闻社的特殊请求头
+    elif "中国新闻社" in website.name or "chinanews" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.chinanews.com.cn/"
+        })
+    # 添加中国政府网的特殊请求头
+    elif "中国政府网" in website.name or "gov.cn" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.gov.cn/"
+        })
+    # 添加经济日报的特殊请求头
+    elif "经济日报" in website.name or "ce.cn" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "http://www.ce.cn/"
+        })
+    # 添加求是网的特殊请求头
+    elif "求是" in website.name or "qstheory" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "http://www.qstheory.cn/"
+        })
+    # 添加人民政协网的特殊请求头
+    elif "人民政协网" in website.name or "rmzxw" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.rmzxw.com.cn/"
+        })
+    # 添加学习时报的特殊请求头
+    elif "学习时报" in website.name or "studytimes" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.studytimes.cn/"
+        })
+    # 添加中国妇女报的特殊请求头
+    elif "中国妇女报" in website.name or "cnwomen" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.cnwomen.com.cn/"
+        })
+    # 添加中国青年报的特殊请求头
+    elif "中国青年报" in website.name or "cyol" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "http://news.cyol.com/"
+        })

     # 判断是否需要使用Selenium
     need_selenium = False
-    if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
+    if any(name in website.name for name in ["中国妇女报", "cnwomen", "中国纪检监察报", "jjjcb", "中国青年报", "cyol"]):
         need_selenium = True

     try:
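Note: the eight header blocks added here differ only in the Referer value; the six shared fields could live in one table. A sketch under that assumption (the helper name is hypothetical, the values are taken from this hunk):

# 假设的整理写法,非本提交内容
COMMON_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
SITE_REFERERS = {
    "chinanews": "https://www.chinanews.com.cn/",
    "gov.cn": "https://www.gov.cn/",
    "ce.cn": "http://www.ce.cn/",
    "qstheory": "http://www.qstheory.cn/",
    "rmzxw": "https://www.rmzxw.com.cn/",
    "studytimes": "https://www.studytimes.cn/",
    "cnwomen": "https://www.cnwomen.com.cn/",
    "cyol": "http://news.cyol.com/",
}

def build_headers(website_name):
    # 复制公共头,再按站点关键词补上对应的Referer
    headers = dict(COMMON_HEADERS)
    for keyword, referer in SITE_REFERERS.items():
        if keyword in website_name:
            headers["Referer"] = referer
            break
    return headers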
@@ -249,7 +351,7 @@ def process_article(url, website):
         return

     # 检查页面内容是否过短
-    min_length = 100 if "法治日报" in website.name else 300
+    min_length = 200
     if len(page_source) < min_length:
         print(f"页面内容过短,可能是重定向页面:{url}")
         return
@@ -266,6 +368,40 @@ def process_article(url, website):
         print(f"页面内容过短,可能是重定向页面:{url}")
         return

+    # 针对不同网站设置正确的编码
+    if "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
+        resp.encoding = 'utf-8'
+    elif "中国网" in website.name or "china.com.cn" in website.name:
+        resp.encoding = 'utf-8'
+    elif "中国新闻社" in website.name or "chinanews" in website.name:
+        resp.encoding = 'utf-8'
+    elif "中国政府网" in website.name or "gov.cn" in website.name:
+        resp.encoding = 'utf-8'
+    elif "工人日报" in website.name or "workercn" in website.name:
+        resp.encoding = 'utf-8'
+    elif "经济日报" in website.name or "ce.cn" in website.name:
+        resp.encoding = 'utf-8'
+    elif "求是" in website.name or "qstheory" in website.name:
+        resp.encoding = 'utf-8'
+    elif "旗帜网" in website.name or "qizhiwang" in website.name:
+        resp.encoding = 'utf-8'
+    elif "人民日报" in website.name or "people" in website.name:
+        resp.encoding = 'utf-8'
+    elif "人民政协网" in website.name or "rmzxw" in website.name:
+        resp.encoding = 'utf-8'
+    elif "学习时报" in website.name or "studytimes" in website.name:
+        resp.encoding = 'utf-8'
+    elif "中国妇女报" in website.name or "cnwomen" in website.name:
+        resp.encoding = 'utf-8'
+    elif "中国青年报" in website.name or "cyol" in website.name:
+        resp.encoding = 'utf-8'
+    elif "学习强国" in website.name or "xuexi" in website.name:
+        resp.encoding = 'utf-8'
+    elif "法治日报" in website.name or "legaldaily" in website.name:
+        resp.encoding = 'utf-8'
+    else:
+        resp.encoding = 'utf-8'

     # 创建BeautifulSoup对象
     soup = BeautifulSoup(resp.text, "html.parser")
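Note: every branch of the new chain assigns the same value, so it reduces to a single unconditional `resp.encoding = 'utf-8'`. If a non-UTF-8 site ever appears, requests' own detection could serve as a fallback. A minimal sketch, not part of this commit (the wrapper name and timeout are assumptions):

import requests

def fetch_utf8(url):
    # 假设性的封装:统一按UTF-8解码,必要时回退到requests的编码探测
    resp = requests.get(url, timeout=10)
    resp.encoding = 'utf-8'
    # 解码失败的字节会变成替换字符U+FFFD,此时改用探测到的编码
    if '\ufffd' in resp.text:
        resp.encoding = resp.apparent_encoding
    return resp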
@@ -274,29 +410,32 @@ def process_article(url, website):
         return

-    # 针对不同网站设置正确的编码(仅对requests获取的内容)
-    if not need_selenium:
-        if "人民网" in website.name or "人民日报" in website.name:
-            resp.encoding = 'utf-8'
-        elif "新华网" in website.name:
-            resp.encoding = 'utf-8'
-        elif "央视" in website.name or "CCTV" in website.name:
-            resp.encoding = 'utf-8'
-        elif "农民日报" in website.name or "farmer" in website.name:
-            resp.encoding = 'utf-8'
-            # 尝试其他编码
-            if '锘' in resp.text or len(resp.text) < 1000:
-                resp.encoding = 'gbk'
-                # 进一步尝试其他编码
-                if '锘' in resp.text or '锘' in resp.text:
-                    resp.encoding = 'gb2312'
-                    # 如果还是有问题,尝试更多编码
-                    if '锘' in resp.text or '锘' in resp.text:
-                        resp.encoding = 'utf-8-sig'
-                        # 最后尝试
-                        if '锘' in resp.text or '锘' in resp.text:
-                            resp.encoding = 'big5'
-        else:
-            resp.encoding = 'utf-8'
+    # 注释掉原有的编码处理逻辑,统一使用UTF-8
+    # if not need_selenium:
+    #     if "人民网" in website.name or "人民日报" in website.name:
+    #         resp.encoding = 'utf-8'
+    #     elif "新华网" in website.name:
+    #         resp.encoding = 'utf-8'
+    #     elif "农民日报" in website.name or "farmer" in website.name:
+    #         resp.encoding = 'utf-8'
+    #         # 尝试其他编码
+    #         if '锘' in resp.text or len(resp.text) < 1000:
+    #             resp.encoding = 'gbk'
+    #             # 进一步尝试其他编码
+    #             if '锘' in resp.text or '锘' in resp.text:
+    #                 resp.encoding = 'gb2312'
+    #                 # 如果还是有问题,尝试更多编码
+    #                 if '锘' in resp.text or '锘' in resp.text:
+    #                     resp.encoding = 'utf-8-sig'
+    #                 # 最后尝试
+    #                 if '锘' in resp.text or '锘' in resp.text:
+    #                     resp.encoding = 'big5'
+    #     else:
+    #         resp.encoding = 'utf-8'
+
+    # 统一设置编码为UTF-8,解决乱码问题
+    #if not need_selenium:
+    #    resp.encoding = 'utf-8'

     # 处理不同网站的文章结构
     if website.name == "新华网" or website.name == "www.news.cn" or "新华网" in website.name:
@@ -363,6 +502,16 @@ def process_article(url, website):
                 if title_text in title_element_text or title_element_text in title_text:
                     title_element.decompose()

+            # 移除id为ti的元素(中国政府网特有的标题元素)
+            for ti_element in content_tag.find_all(id="ti"):
+                ti_element.decompose()
+
+            # 移除包含"简历"等关键词的重复标题
+            for element in content_tag.find_all(["h1", "h2", "h3", "strong", "b"]):
+                element_text = element.get_text(strip=True)
+                if "简历" in element_text and len(element_text) < 20:
+                    element.decompose()
+
             # 移除编辑信息
             for editor_element in content_tag.find_all("div", class_="editor"):
                 editor_element.decompose()
@@ -394,7 +543,19 @@ def process_article(url, website):
             soup.find("section", class_="content") or
             soup.find("div", class_="article") or
             soup.find("div", class_="rm_txt_con") or  # 添加人民网特有的内容容器
-            soup.find("div", class_="text_c")  # 添加新的内容容器
+            soup.find("div", class_="text_c") or  # 添加新的内容容器
+            soup.find("div", class_="article-detail") or  # 人民日报文章详情容器
+            soup.find("div", class_="detail-content") or  # 人民日报详情内容容器
+            soup.find("div", class_="article-text") or  # 人民日报文章文本容器
+            soup.find("div", class_="content-text") or  # 人民日报内容文本容器
+            soup.find("div", class_="news-content") or  # 人民日报新闻内容容器
+            soup.find("div", class_="news-text") or  # 人民日报新闻文本容器
+            soup.find("div", class_="news-detail") or  # 人民日报新闻详情容器
+            soup.find("div", class_="article-main") or  # 人民日报文章主体容器
+            soup.find("div", class_="article-container") or  # 人民日报文章容器
+            soup.find("div", class_="content-container") or  # 人民日报内容容器
+            soup.find("div", class_="text-container") or  # 人民日报文本容器
+            soup.find("div", class_="main-container")  # 人民日报主体容器
         )

         # 针对人民网的特殊处理,清理内容中的无关元素
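Note: this hunk (and the near-identical ones below for 央视网, 中国青年报, 中国妇女报, 农民日报) grows the or-chain of soup.find calls one class name at a time; a candidate list expresses the same priority order in one place. A sketch, not part of this commit, using the class names from this hunk:

from bs4 import BeautifulSoup

# 按优先级排列的候选class(取自本hunk)
CANDIDATE_CLASSES = [
    "rm_txt_con", "text_c", "article-detail", "detail-content",
    "article-text", "content-text", "news-content", "news-text",
    "news-detail", "article-main", "article-container",
    "content-container", "text-container", "main-container",
]

def find_content_tag(soup, candidates=CANDIDATE_CLASSES):
    # 返回第一个命中的<div>,与原来的or链等价
    for cls in candidates:
        tag = soup.find("div", class_=cls)
        if tag is not None:
            return tag
    return None

# 用法示例
# tag = find_content_tag(BeautifulSoup(html, "html.parser"))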
@@ -454,7 +615,19 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="article-detail") or  # 央视网文章详情容器
+            soup.find("div", class_="detail-content") or  # 央视网详情内容容器
+            soup.find("div", class_="article-text") or  # 央视网文章文本容器
+            soup.find("div", class_="content-text") or  # 央视网内容文本容器
+            soup.find("div", class_="news-content") or  # 央视网新闻内容容器
+            soup.find("div", class_="news-text") or  # 央视网新闻文本容器
+            soup.find("div", class_="news-detail") or  # 央视网新闻详情容器
+            soup.find("div", class_="article-main") or  # 央视网文章主体容器
+            soup.find("div", class_="article-container") or  # 央视网文章容器
+            soup.find("div", class_="content-container") or  # 央视网内容容器
+            soup.find("div", class_="text-container") or  # 央视网文本容器
+            soup.find("div", class_="main-container")  # 央视网主体容器
         )

         # 针对央视网的特殊处理,清理内容中的无关元素
@@ -503,6 +676,17 @@ def process_article(url, website):
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

+        # 针对求是的特殊处理,如果标题包含"海报"等关键词,尝试从内容中提取更好的标题
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if "海报" in title_text or "图" in title_text:
+                # 尝试从内容中查找更好的标题
+                content_h1 = soup.find("h1")
+                if content_h1 and content_h1 != title_tag:
+                    content_title = content_h1.get_text(strip=True)
+                    if len(content_title) > len(title_text) and "海报" not in content_title:
+                        title_tag = content_h1
+
         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -819,14 +1003,41 @@ def process_article(url, website):
         # 特殊处理人民政协网的标题结构
         if title_tag and title_tag.find("span", id="a"):
             title_tag = title_tag.find("span", id="a")
-        elif title_tag and title_tag.get_text(strip=True) == "首页>聚焦":
+        elif title_tag and (title_tag.get_text(strip=True) == "首页>聚焦" or title_tag.get_text(strip=True) == "首页 > 聚焦"):
             # 如果标题还是"首页>聚焦",尝试从内容中提取标题
-            if content_tag:
-                first_p = content_tag.find("p")
+            # 查找文章正文中的第一个strong标签作为标题
+            content_div = soup.find("div", class_="text_box")
+            if content_div:
+                first_p = content_div.find("p")
                 if first_p and first_p.find("strong"):
                     title_text = first_p.find("strong").get_text().strip()
                     # 创建一个虚拟的title_tag对象
                     title_tag = first_p.find("strong")
+                else:
+                    # 如果没有找到strong标签,尝试查找内容中的第一个h2标签
+                    first_h2 = content_div.find("h2")
+                    if first_h2:
+                        title_tag = first_h2

+        # 针对人民政协网的特殊处理,如果标题包含"首页>聚焦",尝试从页面中查找更好的标题
+        if title_tag and ("首页>聚焦" in title_tag.get_text(strip=True) or "首页 > 聚焦" in title_tag.get_text(strip=True)):
+            # 尝试从页面中查找其他可能的标题
+            for h in soup.find_all(["h1", "h2", "h3"]):
+                h_text = h.get_text(strip=True)
+                if h_text and "首页>聚焦" not in h_text and "首页 > 聚焦" not in h_text and len(h_text) > 5:
+                    title_tag = h
+                    break
+
+            # 如果还是没找到,尝试从title标签提取
+            if "首页>聚焦" in title_tag.get_text(strip=True) or "首页 > 聚焦" in title_tag.get_text(strip=True):
+                page_title = soup.find("title")
+                if page_title:
+                    title_text = page_title.get_text(strip=True)
+                    # 移除网站名称等后缀信息
+                    if " - 人民政协网" in title_text:
+                        title_text = title_text.split(" - 人民政协网")[0]
+                    if "首页>聚焦" not in title_text and "首页 > 聚焦" not in title_text and len(title_text) > 5:
+                        title_tag = page_title

         content_tag = (
             soup.find("div", class_="content") or
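Note: the 人民政协网 title fallbacks now span three passes (strong/h2 in the body, other headings, the page title). Collected into one helper, the order of preference is easier to read. A sketch mirroring this hunk's logic (the helper name is hypothetical):

# 假设的提炼写法,非本提交内容:按顺序尝试多个标题来源
def pick_rmzxw_title(soup, title_tag):
    bad = ("首页>聚焦", "首页 > 聚焦")
    def is_bad(text):
        return any(b in text for b in bad) or len(text) <= 5
    if title_tag and not is_bad(title_tag.get_text(strip=True)):
        return title_tag
    for h in soup.find_all(["h1", "h2", "h3"]):
        if not is_bad(h.get_text(strip=True)):
            return h
    page_title = soup.find("title")
    if page_title:
        text = page_title.get_text(strip=True).split(" - 人民政协网")[0]
        if not is_bad(text):
            return page_title
    return title_tag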
@@ -972,7 +1183,14 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="article-detail") or  # 中国青年报文章详情容器
+            soup.find("div", class_="detail-content") or  # 中国青年报详情内容容器
+            soup.find("div", class_="article-text") or  # 中国青年报文章文本容器
+            soup.find("div", class_="content-text") or  # 中国青年报内容文本容器
+            soup.find("div", class_="news-content") or  # 中国青年报新闻内容容器
+            soup.find("div", class_="news-text") or  # 中国青年报新闻文本容器
+            soup.find("div", class_="news-detail")  # 中国青年报新闻详情容器
         )
     elif "中国妇女报" in website.name or "cnwomen" in website.name:
         # 中国妇女报的文章结构处理 - 修复不保存文章内容问题
@@ -987,6 +1205,9 @@ def process_article(url, website):
         title_tag = soup.find("title")

         content_tag = (
+            soup.find("div", class_="f_container") or  # 中国妇女报特有内容容器
+            soup.find("div", class_="f_container_left") or  # 中国妇女报特有内容容器
+            soup.find("div", class_="f_navigation_bars") or  # 中国妇女报特有内容容器
             soup.find("div", class_="main") or  # 中国妇女报特有内容容器
             soup.find("div", class_="news") or  # 中国妇女报特有内容容器
             soup.find("div", class_="content") or
@@ -995,7 +1216,19 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="article-detail") or  # 中国妇女报文章详情容器
+            soup.find("div", class_="detail-content") or  # 中国妇女报详情内容容器
+            soup.find("div", class_="article-text") or  # 中国妇女报文章文本容器
+            soup.find("div", class_="content-text") or  # 中国妇女报内容文本容器
+            soup.find("div", class_="news-content") or  # 中国妇女报新闻内容容器
+            soup.find("div", class_="news-text") or  # 中国妇女报新闻文本容器
+            soup.find("div", class_="news-detail") or  # 中国妇女报新闻详情容器
+            soup.find("div", class_="article-main") or  # 中国妇女报文章主体容器
+            soup.find("div", class_="article-container") or  # 中国妇女报文章容器
+            soup.find("div", class_="content-container") or  # 中国妇女报内容容器
+            soup.find("div", class_="text-container") or  # 中国妇女报文本容器
+            soup.find("div", class_="main-container")  # 中国妇女报主体容器
         )
     elif "法治日报" in website.name or "legaldaily" in website.name:
         # 法治日报的文章结构处理 - 修复不保存正文和图片问题
@@ -1080,7 +1313,19 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="article-detail") or  # 农民日报文章详情容器
+            soup.find("div", class_="detail-content") or  # 农民日报详情内容容器
+            soup.find("div", class_="article-text") or  # 农民日报文章文本容器
+            soup.find("div", class_="content-text") or  # 农民日报内容文本容器
+            soup.find("div", class_="news-content") or  # 农民日报新闻内容容器
+            soup.find("div", class_="news-text") or  # 农民日报新闻文本容器
+            soup.find("div", class_="news-detail") or  # 农民日报新闻详情容器
+            soup.find("div", class_="article-main") or  # 农民日报文章主体容器
+            soup.find("div", class_="article-container") or  # 农民日报文章容器
+            soup.find("div", class_="content-container") or  # 农民日报内容容器
+            soup.find("div", class_="text-container") or  # 农民日报文本容器
+            soup.find("div", class_="main-container")  # 农民日报主体容器
         )

         # 针对农民日报的特殊处理,如果找到多个detailCon,选择内容最长的那个
@@ -1253,11 +1498,29 @@ def process_article(url, website):
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="news-content") or  # 中国网新闻内容容器
+            soup.find("div", class_="news-text") or  # 中国网新闻文本容器
+            soup.find("div", class_="news-detail") or  # 中国网新闻详情容器
+            soup.find("div", class_="detail-content") or  # 中国网详情内容容器
+            soup.find("div", class_="article-text") or  # 中国网文章文本容器
+            soup.find("div", class_="content-text")  # 中国网内容文本容器
         )

         # 针对中国网的特殊处理,清理内容中的无关元素
         if content_tag:
+            # 检查内容质量,过滤掉纯文本内容
+            content_text = content_tag.get_text(strip=True)
+            if len(content_text) < 100:  # 如果内容太短,可能是纯文本
+                print(f"中国网内容过短,可能是纯文本,跳过: {url}")
+                return
+
+            # 检查是否包含足够的HTML标签(图片、链接等)
+            html_elements = content_tag.find_all(["img", "a", "p", "div", "span"])
+            if len(html_elements) < 3:  # 如果HTML元素太少,可能是纯文本
+                print(f"中国网内容HTML元素过少,可能是纯文本,跳过: {url}")
+                return
+
             # 移除编辑信息
             for editor_element in content_tag.find_all("div", class_="editor"):
                 editor_element.decompose()
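Note: the length and element-count checks added for 中国网 form a reusable quality gate. A sketch with the thresholds from this hunk (the helper name is hypothetical, not part of this commit):

# 假设的提炼写法,非本提交内容:正文质量检查
def looks_like_article(content_tag, min_chars=100, min_elements=3):
    if content_tag is None:
        return False
    if len(content_tag.get_text(strip=True)) < min_chars:
        return False  # 文字太少,可能是纯文本或重定向页
    if len(content_tag.find_all(["img", "a", "p", "div", "span"])) < min_elements:
        return False  # HTML元素太少,可能不是正文容器
    return True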
@@ -1355,6 +1618,71 @@ def process_article(url, website):
                     src = value
                     break

+        # 新增:查找新华网特有的视频播放器结构
+        if not src:
+            # 查找包含视频信息的script标签
+            for script in soup.find_all("script"):
+                if script.string and "video" in script.string.lower():
+                    # 尝试从script中提取视频URL
+                    import re
+                    video_patterns = [
+                        r'https?://[^\s"\']+\.(?:mp4|flv|avi|mov|wmv)',
+                        r'https?://[^\s"\']+video[^\s"\']*',
+                        r'https?://[^\s"\']+media[^\s"\']*'
+                    ]
+                    for pattern in video_patterns:
+                        matches = re.findall(pattern, script.string)
+                        if matches:
+                            src = matches[0]
+                            break
+                    if src:
+                        break
+
+        # 新增:查找新华网特有的iframe视频播放器
+        if not src:
+            iframe = soup.find("iframe", src=lambda x: x and ("video" in x or "player" in x))
+            if iframe:
+                src = iframe.get("src")
+
+        # 新增:查找新华网特有的视频播放器容器
+        if not src:
+            video_container = soup.find("div", class_="video-container") or soup.find("div", class_="player-container")
+            if video_container:
+                # 在容器中查找视频元素
+                video_elem = video_container.find("video")
+                if video_elem:
+                    src = video_elem.get("src") or video_elem.get("data-src")
+
+                # 如果没有找到video标签,查找source标签
+                if not src:
+                    source_elem = video_container.find("source")
+                    if source_elem:
+                        src = source_elem.get("src") or source_elem.get("data-src")
+
+        # 新增:查找新华网特有的视频链接
+        if not src:
+            video_links = soup.find_all("a", href=lambda x: x and ("video" in x or "media" in x))
+            for link in video_links:
+                href = link.get("href")
+                if href and (".mp4" in href or ".flv" in href or "video" in href):
+                    src = href
+                    break
+
+        # 新增:直接从video标签的属性中获取src(处理新华网视频)
+        if not src and video.get("src"):
+            src = video.get("src")
+
+        # 新增:处理新华网视频,从示例代码中提取src
+        if not src and "新华网" in website.name:
+            # 直接从video标签中获取src属性
+            if video.has_attr('src'):
+                src = video.get('src')
+            # 检查是否有完整的属性列表
+            for attr in video.attrs:
+                if isinstance(video.attrs[attr], str) and ('.mp4' in video.attrs[attr] or 'vodpub' in video.attrs[attr]):
+                    src = video.attrs[attr]
+                    break

         if not src:
             continue
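Note: the script-tag fallback above relies on three URL regexes. A quick standalone check of what they match; the input string is a made-up example (the 'vodpub' host fragment is borrowed from the attribute scan in this hunk):

import re

VIDEO_PATTERNS = [
    r'https?://[^\s"\']+\.(?:mp4|flv|avi|mov|wmv)',
    r'https?://[^\s"\']+video[^\s"\']*',
    r'https?://[^\s"\']+media[^\s"\']*',
]

script_text = 'var player = {url: "https://vodpub1.v.news.cn/demo/video123.mp4"};'
for pattern in VIDEO_PATTERNS:
    matches = re.findall(pattern, script_text)
    if matches:
        print(matches[0])  # 输出 https://vodpub1.v.news.cn/demo/video123.mp4
        break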
@@ -1449,7 +1777,36 @@ def full_site_crawler(start_url, website, max_pages=1000):
            print(f"请求失败:{url},错误:{e}")
            continue

+        # 针对不同网站设置正确的编码
+        if "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
+            resp.encoding = 'utf-8'
+        elif "中国网" in website.name or "china.com.cn" in website.name:
+            resp.encoding = 'utf-8'
+        elif "中国新闻社" in website.name or "chinanews" in website.name:
+            resp.encoding = 'utf-8'
+        elif "中国政府网" in website.name or "gov.cn" in website.name:
+            resp.encoding = 'utf-8'
+        elif "工人日报" in website.name or "workercn" in website.name:
+            resp.encoding = 'utf-8'
+        elif "经济日报" in website.name or "ce.cn" in website.name:
+            resp.encoding = 'utf-8'
+        elif "求是" in website.name or "qstheory" in website.name:
+            resp.encoding = 'utf-8'
+        elif "旗帜网" in website.name or "qizhiwang" in website.name:
+            resp.encoding = 'utf-8'
+        elif "人民日报" in website.name or "people" in website.name:
+            resp.encoding = 'utf-8'
+        elif "人民政协网" in website.name or "rmzxw" in website.name:
+            resp.encoding = 'utf-8'
+        elif "学习时报" in website.name or "studytimes" in website.name:
+            resp.encoding = 'utf-8'
+        elif "中国妇女报" in website.name or "cnwomen" in website.name:
+            resp.encoding = 'utf-8'
+        elif "中国青年报" in website.name or "cyol" in website.name:
+            resp.encoding = 'utf-8'
+        else:
+            resp.encoding = 'utf-8'

        soup = BeautifulSoup(resp.text, "html.parser")

        # 根据不同网站判断文章页面