deploy test

2025-08-17 02:12:25 +08:00
parent 4945b4c6b0
commit 193894fcb4
20 changed files with 413 additions and 56 deletions

View File

@@ -9,7 +9,7 @@ class Command(BaseCommand):
     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
-                            choices=['cctv', 'cctvnews', 'all'],
+                            choices=['cctvnews', 'all'],
                             help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), all(全部)')

     def handle(self, *args, **options):
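
All of the command files touched below share the same Django management-command skeleton (note that this hunk drops 'cctv' from choices while the help string still lists cctv(央视网)). For reference, a minimal runnable sketch of the pattern — file path and platform names are hypothetical:

    # crawler/management/commands/crawl_example.py (hypothetical path)
    from django.core.management.base import BaseCommand

    class Command(BaseCommand):
        help = "Recursively crawl an example platform"

        def add_arguments(self, parser):
            # choices= restricts --platform to the listed values
            parser.add_argument('--platform', type=str, default='all',
                                choices=['example', 'all'],
                                help='Platform to crawl: example or all')

        def handle(self, *args, **options):
            platform = options['platform']
            self.stdout.write(f"Crawling platform: {platform}")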

View File

@@ -4,12 +4,12 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"
+    help = "全站递归爬取 中国网主网"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
                             choices=['china', 'all'],
-                            help='选择爬取平台: china(中国网主网), province(中国网一省份), all(全部)')
+                            help='选择爬取平台: china(中国网主网), all(全部)')

     def handle(self, *args, **options):
         platform = options['platform']

View File

@@ -4,11 +4,11 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
-                            choices=['chinadaily', 'mobile', 'all'],
+                            choices=['chinadaily', 'all'],
                             help='选择爬取平台: chinadaily(中国日报), all(全部)')

     def handle(self, *args, **options):

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国新闻社及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国新闻社平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 法治日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 法治日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -5,7 +5,7 @@ from core.utils import full_site_crawler
 # jimmy.fang-20250815: 取消对光明日报的支持,光明日报反爬,被阻挡
 class Command(BaseCommand):
-    help = "全站递归爬取 光明日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 光明日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 工人日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 工人日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 经济日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 经济日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -5,7 +5,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 科技日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 科技日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 解放军报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 解放军报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 求是杂志及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 求是杂志平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 旗帜网及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 旗帜网平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 人民政协网及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 人民政协网平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 新华社及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 新华社平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 学习强国中央媒体学习号及省级以上学习平台"
+    help = "全站递归爬取 学习强国平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 学习时报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 学习时报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国妇女报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国妇女报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国纪检监察报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国纪检监察报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国青年报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国青年报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -54,6 +54,28 @@ def get_page_with_selenium(url, website_name):
         wait_time = 15  # 学习强国需要更长时间
     elif "法治日报" in website_name:
         wait_time = 12  # 法治日报需要较长时间
+    elif "中国新闻社" in website_name or "chinanews" in website_name:
+        wait_time = 12  # 中国新闻社需要较长时间
+    elif "中国政府网" in website_name or "gov.cn" in website_name:
+        wait_time = 12  # 中国政府网需要较长时间
+    elif "工人日报" in website_name or "workercn" in website_name:
+        wait_time = 12  # 工人日报需要较长时间
+    elif "经济日报" in website_name or "ce.cn" in website_name:
+        wait_time = 12  # 经济日报需要较长时间
+    elif "求是" in website_name or "qstheory" in website_name:
+        wait_time = 12  # 求是网需要较长时间
+    elif "旗帜网" in website_name or "qizhiwang" in website_name:
+        wait_time = 12  # 旗帜网需要较长时间
+    elif "人民日报" in website_name or "people" in website_name:
+        wait_time = 12  # 人民日报需要较长时间
+    elif "人民政协网" in website_name or "rmzxw" in website_name:
+        wait_time = 12  # 人民政协网需要较长时间
+    elif "学习时报" in website_name or "studytimes" in website_name:
+        wait_time = 12  # 学习时报需要较长时间
+    elif "中国妇女报" in website_name or "cnwomen" in website_name:
+        wait_time = 12  # 中国妇女报需要较长时间
+    elif "中国青年报" in website_name or "cyol" in website_name:
+        wait_time = 12  # 中国青年报需要较长时间

     # 等待页面主要内容加载
     try:
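
The elif ladder above grows by two lines per site; the same behavior could be expressed as a keyword-to-timeout table. A sketch, not part of the commit (the 10-second default is an assumption — the fallback is not shown in the diff):

    # Hypothetical refactor: first matching keyword wins, like the elif chain.
    SITE_WAIT_TIMES = {
        "学习强国": 15, "xuexi": 15,
        "法治日报": 12, "chinanews": 12, "gov.cn": 12, "workercn": 12,
        "ce.cn": 12, "qstheory": 12, "qizhiwang": 12, "people": 12,
        "rmzxw": 12, "studytimes": 12, "cnwomen": 12, "cyol": 12,
    }

    def get_wait_time(website_name, default=10):
        for keyword, seconds in SITE_WAIT_TIMES.items():
            if keyword in website_name:
                return seconds
        return default  # assumed fallback
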
@@ -234,10 +256,90 @@ def process_article(url, website):
"Upgrade-Insecure-Requests": "1", "Upgrade-Insecure-Requests": "1",
"Referer": "http://www.qizhiwang.org.cn/" "Referer": "http://www.qizhiwang.org.cn/"
}) })
# 添加中国新闻社的特殊请求头
elif "中国新闻社" in website.name or "chinanews" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://www.chinanews.com.cn/"
})
# 添加中国政府网的特殊请求头
elif "中国政府网" in website.name or "gov.cn" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://www.gov.cn/"
})
# 添加经济日报的特殊请求头
elif "经济日报" in website.name or "ce.cn" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "http://www.ce.cn/"
})
# 添加求是网的特殊请求头
elif "求是" in website.name or "qstheory" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "http://www.qstheory.cn/"
})
# 添加人民政协网的特殊请求头
elif "人民政协网" in website.name or "rmzxw" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://www.rmzxw.com.cn/"
})
# 添加学习时报的特殊请求头
elif "学习时报" in website.name or "studytimes" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://www.studytimes.cn/"
})
# 添加中国妇女报的特殊请求头
elif "中国妇女报" in website.name or "cnwomen" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://www.cnwomen.com.cn/"
})
# 添加中国青年报的特殊请求头
elif "中国青年报" in website.name or "cyol" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "http://news.cyol.com/"
})
# 判断是否需要使用Selenium # 判断是否需要使用Selenium
need_selenium = False need_selenium = False
if any(name in website.name for name in ["学习强国", "xuexi", "法治日", "legaldaily"]): if any(name in website.name for name in ["中国妇女报", "cnwomen", "中国纪检监察", "jjjcb", "中国青年报", "cyol"]):
need_selenium = True need_selenium = True
try: try:
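
Each per-site branch in this hunk sets identical Accept/Connection headers and differs only in Referer, so the eight blocks could collapse into shared constants. A sketch under that assumption (names invented):

    # Hypothetical consolidation of the per-site header branches.
    COMMON_HEADERS = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    SITE_REFERERS = {
        "chinanews": "https://www.chinanews.com.cn/",
        "gov.cn": "https://www.gov.cn/",
        "ce.cn": "http://www.ce.cn/",
        "qstheory": "http://www.qstheory.cn/",
        "rmzxw": "https://www.rmzxw.com.cn/",
        "studytimes": "https://www.studytimes.cn/",
        "cnwomen": "https://www.cnwomen.com.cn/",
        "cyol": "http://news.cyol.com/",
    }

    def build_headers(website_name, headers):
        # Shared headers plus the first matching site's Referer.
        headers.update(COMMON_HEADERS)
        for keyword, referer in SITE_REFERERS.items():
            if keyword in website_name:
                headers["Referer"] = referer
                break
        return headers
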
@@ -249,7 +351,7 @@ def process_article(url, website):
             return

     # 检查页面内容是否过短
-    min_length = 100 if "法治日报" in website.name else 300
+    min_length = 200
     if len(page_source) < min_length:
         print(f"页面内容过短,可能是重定向页面:{url}")
         return
@@ -266,6 +368,40 @@ def process_article(url, website):
print(f"页面内容过短,可能是重定向页面:{url}") print(f"页面内容过短,可能是重定向页面:{url}")
return return
# 针对不同网站设置正确的编码
if "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
resp.encoding = 'utf-8'
elif "中国网" in website.name or "china.com.cn" in website.name:
resp.encoding = 'utf-8'
elif "中国新闻社" in website.name or "chinanews" in website.name:
resp.encoding = 'utf-8'
elif "中国政府网" in website.name or "gov.cn" in website.name:
resp.encoding = 'utf-8'
elif "工人日报" in website.name or "workercn" in website.name:
resp.encoding = 'utf-8'
elif "经济日报" in website.name or "ce.cn" in website.name:
resp.encoding = 'utf-8'
elif "求是" in website.name or "qstheory" in website.name:
resp.encoding = 'utf-8'
elif "旗帜网" in website.name or "qizhiwang" in website.name:
resp.encoding = 'utf-8'
elif "人民日报" in website.name or "people" in website.name:
resp.encoding = 'utf-8'
elif "人民政协网" in website.name or "rmzxw" in website.name:
resp.encoding = 'utf-8'
elif "学习时报" in website.name or "studytimes" in website.name:
resp.encoding = 'utf-8'
elif "中国妇女报" in website.name or "cnwomen" in website.name:
resp.encoding = 'utf-8'
elif "中国青年报" in website.name or "cyol" in website.name:
resp.encoding = 'utf-8'
elif "学习强国" in website.name or "xuexi" in website.name:
resp.encoding = 'utf-8'
elif "法治日报" in website.name or "legaldaily" in website.name:
resp.encoding = 'utf-8'
else:
resp.encoding = 'utf-8'
# 创建BeautifulSoup对象 # 创建BeautifulSoup对象
soup = BeautifulSoup(resp.text, "html.parser") soup = BeautifulSoup(resp.text, "html.parser")
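
Every branch of the new chain assigns 'utf-8', so it is behaviorally a single assignment. If genuine per-site detection is ever needed again, requests can guess from the response body; a sketch, not part of the commit:

    import requests

    def fetch_text(url):
        resp = requests.get(url, timeout=10)
        # Current commit behavior: force UTF-8 for every site.
        resp.encoding = 'utf-8'
        # Possible alternative: let requests guess from the payload.
        # resp.encoding = resp.apparent_encoding or 'utf-8'
        return resp.text
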
@@ -274,29 +410,32 @@ def process_article(url, website):
         return

     # 针对不同网站设置正确的编码(仅对requests获取的内容)
-    if not need_selenium:
-        if "人民网" in website.name or "人民日报" in website.name:
-            resp.encoding = 'utf-8'
-        elif "新华网" in website.name:
-            resp.encoding = 'utf-8'
-        elif "央视" in website.name or "CCTV" in website.name:
-            resp.encoding = 'utf-8'
-        elif "农民日报" in website.name or "farmer" in website.name:
-            resp.encoding = 'utf-8'
-            # 尝试其他编码
-            if '锘' in resp.text or len(resp.text) < 1000:
-                resp.encoding = 'gbk'
-                # 进一步尝试其他编码
-                if '锘' in resp.text or '锘' in resp.text:
-                    resp.encoding = 'gb2312'
-                    # 如果还是有问题,尝试更多编码
-                    if '锘' in resp.text or '锘' in resp.text:
-                        resp.encoding = 'utf-8-sig'
-                        # 最后尝试
-                        if '锘' in resp.text or '锘' in resp.text:
-                            resp.encoding = 'big5'
-    else:
-        resp.encoding = 'utf-8'
+    # 注释掉原有的编码处理逻辑,统一使用UTF-8
+    # if not need_selenium:
+    #     if "人民网" in website.name or "人民日报" in website.name:
+    #         resp.encoding = 'utf-8'
+    #     elif "新华网" in website.name:
+    #         resp.encoding = 'utf-8'
+    #     elif "农民日报" in website.name or "farmer" in website.name:
+    #         resp.encoding = 'utf-8'
+    #         # 尝试其他编码
+    #         if '锘' in resp.text or len(resp.text) < 1000:
+    #             resp.encoding = 'gbk'
+    #             # 进一步尝试其他编码
+    #             if '锘' in resp.text or '锘' in resp.text:
+    #                 resp.encoding = 'gb2312'
+    #                 # 如果还是有问题,尝试更多编码
+    #                 if '锘' in resp.text or '锘' in resp.text:
+    #                     resp.encoding = 'utf-8-sig'
+    #                     # 最后尝试
+    #                     if '锘' in resp.text or '锘' in resp.text:
+    #                         resp.encoding = 'big5'
+    # else:
+    #     resp.encoding = 'utf-8'
+
+    # 统一设置编码为UTF-8,解决乱码问题
+    #if not need_selenium:
+    #    resp.encoding = 'utf-8'

     # 处理不同网站的文章结构
     if website.name == "新华网" or website.name == "www.news.cn" or "新华网" in website.name:
@@ -362,6 +501,16 @@ def process_article(url, website):
                 title_element_text = title_element.get_text(strip=True)
                 if title_text in title_element_text or title_element_text in title_text:
                     title_element.decompose()

+            # 移除id为ti的元素(中国政府网特有的标题元素)
+            for ti_element in content_tag.find_all(id="ti"):
+                ti_element.decompose()
+
+            # 移除包含"简历"等关键词的重复标题
+            for element in content_tag.find_all(["h1", "h2", "h3", "strong", "b"]):
+                element_text = element.get_text(strip=True)
+                if "简历" in element_text and len(element_text) < 20:
+                    element.decompose()

         # 移除编辑信息
         for editor_element in content_tag.find_all("div", class_="editor"):
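
The two removal passes added here could live in one helper; a sketch mirroring the diff's logic (function name and defaults invented):

    def strip_duplicate_headings(content_tag, keyword="简历", max_len=20):
        # Drop gov.cn's dedicated title node (id="ti").
        for ti_element in content_tag.find_all(id="ti"):
            ti_element.decompose()
        # Drop short repeated headings that contain the keyword.
        for element in content_tag.find_all(["h1", "h2", "h3", "strong", "b"]):
            text = element.get_text(strip=True)
            if keyword in text and len(text) < max_len:
                element.decompose()
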
@@ -394,7 +543,19 @@ def process_article(url, website):
soup.find("section", class_="content") or soup.find("section", class_="content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="rm_txt_con") or # 添加人民网特有的内容容器 soup.find("div", class_="rm_txt_con") or # 添加人民网特有的内容容器
soup.find("div", class_="text_c") # 添加新的内容容器 soup.find("div", class_="text_c") or # 添加新的内容容器
soup.find("div", class_="article-detail") or # 人民日报文章详情容器
soup.find("div", class_="detail-content") or # 人民日报详情内容容器
soup.find("div", class_="article-text") or # 人民日报文章文本容器
soup.find("div", class_="content-text") or # 人民日报内容文本容器
soup.find("div", class_="news-content") or # 人民日报新闻内容容器
soup.find("div", class_="news-text") or # 人民日报新闻文本容器
soup.find("div", class_="news-detail") or # 人民日报新闻详情容器
soup.find("div", class_="article-main") or # 人民日报文章主体容器
soup.find("div", class_="article-container") or # 人民日报文章容器
soup.find("div", class_="content-container") or # 人民日报内容容器
soup.find("div", class_="text-container") or # 人民日报文本容器
soup.find("div", class_="main-container") # 人民日报主体容器
) )
# 针对人民网的特殊处理,清理内容中的无关元素 # 针对人民网的特殊处理,清理内容中的无关元素
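
This or-chain of soup.find calls, repeated for several sites below, is equivalent to a loop over candidate class names; a sketch with the classes added by this hunk (function name invented):

    CONTENT_CLASSES = [
        "rm_txt_con", "text_c", "article-detail", "detail-content",
        "article-text", "content-text", "news-content", "news-text",
        "news-detail", "article-main", "article-container",
        "content-container", "text-container", "main-container",
    ]

    def find_content_tag(soup):
        # First matching class wins, exactly like the chained `or`s.
        for cls in CONTENT_CLASSES:
            tag = soup.find("div", class_=cls)
            if tag is not None:
                return tag
        return None
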
@@ -454,7 +615,19 @@ def process_article(url, website):
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="article-detail") or # 央视网文章详情容器
soup.find("div", class_="detail-content") or # 央视网详情内容容器
soup.find("div", class_="article-text") or # 央视网文章文本容器
soup.find("div", class_="content-text") or # 央视网内容文本容器
soup.find("div", class_="news-content") or # 央视网新闻内容容器
soup.find("div", class_="news-text") or # 央视网新闻文本容器
soup.find("div", class_="news-detail") or # 央视网新闻详情容器
soup.find("div", class_="article-main") or # 央视网文章主体容器
soup.find("div", class_="article-container") or # 央视网文章容器
soup.find("div", class_="content-container") or # 央视网内容容器
soup.find("div", class_="text-container") or # 央视网文本容器
soup.find("div", class_="main-container") # 央视网主体容器
) )
# 针对央视网的特殊处理,清理内容中的无关元素 # 针对央视网的特殊处理,清理内容中的无关元素
@@ -502,6 +675,17 @@ def process_article(url, website):
     # 针对求是的特殊处理,确保标题被正确提取
     if not title_tag or not title_tag.get_text(strip=True):
         title_tag = soup.find("title")

+    # 针对求是的特殊处理,如果标题包含"海报"等关键词,尝试从内容中提取更好的标题
+    if title_tag:
+        title_text = title_tag.get_text(strip=True)
+        if "海报" in title_text or "" in title_text:
+            # 尝试从内容中查找更好的标题
+            content_h1 = soup.find("h1")
+            if content_h1 and content_h1 != title_tag:
+                content_title = content_h1.get_text(strip=True)
+                if len(content_title) > len(title_text) and "海报" not in content_title:
+                    title_tag = content_h1

     content_tag = (
         soup.find("div", class_="content") or
@@ -819,14 +1003,41 @@ def process_article(url, website):
         # 特殊处理人民政协网的标题结构
         if title_tag and title_tag.find("span", id="a"):
             title_tag = title_tag.find("span", id="a")
-        elif title_tag and title_tag.get_text(strip=True) == "首页>聚焦":
+        elif title_tag and (title_tag.get_text(strip=True) == "首页>聚焦" or title_tag.get_text(strip=True) == "首页 > 聚焦"):
             # 如果标题还是"首页>聚焦",尝试从内容中提取标题
-            if content_tag:
-                first_p = content_tag.find("p")
+            # 查找文章正文中的第一个strong标签作为标题
+            content_div = soup.find("div", class_="text_box")
+            if content_div:
+                first_p = content_div.find("p")
                 if first_p and first_p.find("strong"):
                     title_text = first_p.find("strong").get_text().strip()
                     # 创建一个虚拟的title_tag对象
                     title_tag = first_p.find("strong")
+                else:
+                    # 如果没有找到strong标签,尝试查找内容中的第一个h2标签
+                    first_h2 = content_div.find("h2")
+                    if first_h2:
+                        title_tag = first_h2
+
+        # 针对人民政协网的特殊处理,如果标题包含"首页>聚焦",尝试从页面中查找更好的标题
+        if title_tag and ("首页>聚焦" in title_tag.get_text(strip=True) or "首页 > 聚焦" in title_tag.get_text(strip=True)):
+            # 尝试从页面中查找其他可能的标题
+            for h in soup.find_all(["h1", "h2", "h3"]):
+                h_text = h.get_text(strip=True)
+                if h_text and "首页>聚焦" not in h_text and "首页 > 聚焦" not in h_text and len(h_text) > 5:
+                    title_tag = h
+                    break
+            # 如果还是没找到,尝试从title标签提取
+            if "首页>聚焦" in title_tag.get_text(strip=True) or "首页 > 聚焦" in title_tag.get_text(strip=True):
+                page_title = soup.find("title")
+                if page_title:
+                    title_text = page_title.get_text(strip=True)
+                    # 移除网站名称等后缀信息
+                    if " - 人民政协网" in title_text:
+                        title_text = title_text.split(" - 人民政协网")[0]
+                    if "首页>聚焦" not in title_text and "首页 > 聚焦" not in title_text and len(title_text) > 5:
+                        title_tag = page_title

         content_tag = (
             soup.find("div", class_="content") or
@@ -972,7 +1183,14 @@ def process_article(url, website):
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="article-detail") or # 中国青年报文章详情容器
soup.find("div", class_="detail-content") or # 中国青年报详情内容容器
soup.find("div", class_="article-text") or # 中国青年报文章文本容器
soup.find("div", class_="content-text") or # 中国青年报内容文本容器
soup.find("div", class_="news-content") or # 中国青年报新闻内容容器
soup.find("div", class_="news-text") or # 中国青年报新闻文本容器
soup.find("div", class_="news-detail") # 中国青年报新闻详情容器
) )
elif "中国妇女报" in website.name or "cnwomen" in website.name: elif "中国妇女报" in website.name or "cnwomen" in website.name:
# 中国妇女报的文章结构处理 - 修复不保存文章内容问题 # 中国妇女报的文章结构处理 - 修复不保存文章内容问题
@@ -987,6 +1205,9 @@ def process_article(url, website):
             title_tag = soup.find("title")

         content_tag = (
+            soup.find("div", class_="f_container") or  # 中国妇女报特有内容容器
+            soup.find("div", class_="f_container_left") or  # 中国妇女报特有内容容器
+            soup.find("div", class_="f_navigation_bars") or  # 中国妇女报特有内容容器
             soup.find("div", class_="main") or  # 中国妇女报特有内容容器
             soup.find("div", class_="news") or  # 中国妇女报特有内容容器
             soup.find("div", class_="content") or
@@ -995,7 +1216,19 @@ def process_article(url, website):
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="article-detail") or # 中国妇女报文章详情容器
soup.find("div", class_="detail-content") or # 中国妇女报详情内容容器
soup.find("div", class_="article-text") or # 中国妇女报文章文本容器
soup.find("div", class_="content-text") or # 中国妇女报内容文本容器
soup.find("div", class_="news-content") or # 中国妇女报新闻内容容器
soup.find("div", class_="news-text") or # 中国妇女报新闻文本容器
soup.find("div", class_="news-detail") or # 中国妇女报新闻详情容器
soup.find("div", class_="article-main") or # 中国妇女报文章主体容器
soup.find("div", class_="article-container") or # 中国妇女报文章容器
soup.find("div", class_="content-container") or # 中国妇女报内容容器
soup.find("div", class_="text-container") or # 中国妇女报文本容器
soup.find("div", class_="main-container") # 中国妇女报主体容器
) )
elif "法治日报" in website.name or "legaldaily" in website.name: elif "法治日报" in website.name or "legaldaily" in website.name:
# 法治日报的文章结构处理 - 修复不保存正文和图片问题 # 法治日报的文章结构处理 - 修复不保存正文和图片问题
@@ -1080,7 +1313,19 @@ def process_article(url, website):
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="article-detail") or # 农民日报文章详情容器
soup.find("div", class_="detail-content") or # 农民日报详情内容容器
soup.find("div", class_="article-text") or # 农民日报文章文本容器
soup.find("div", class_="content-text") or # 农民日报内容文本容器
soup.find("div", class_="news-content") or # 农民日报新闻内容容器
soup.find("div", class_="news-text") or # 农民日报新闻文本容器
soup.find("div", class_="news-detail") or # 农民日报新闻详情容器
soup.find("div", class_="article-main") or # 农民日报文章主体容器
soup.find("div", class_="article-container") or # 农民日报文章容器
soup.find("div", class_="content-container") or # 农民日报内容容器
soup.find("div", class_="text-container") or # 农民日报文本容器
soup.find("div", class_="main-container") # 农民日报主体容器
) )
# 针对农民日报的特殊处理如果找到多个detailCon选择内容最长的那个 # 针对农民日报的特殊处理如果找到多个detailCon选择内容最长的那个
@@ -1253,11 +1498,29 @@ def process_article(url, website):
soup.find("div", id="content") or soup.find("div", id="content") or
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="news-content") or # 中国网新闻内容容器
soup.find("div", class_="news-text") or # 中国网新闻文本容器
soup.find("div", class_="news-detail") or # 中国网新闻详情容器
soup.find("div", class_="detail-content") or # 中国网详情内容容器
soup.find("div", class_="article-text") or # 中国网文章文本容器
soup.find("div", class_="content-text") # 中国网内容文本容器
) )
# 针对中国网的特殊处理,清理内容中的无关元素 # 针对中国网的特殊处理,清理内容中的无关元素
if content_tag: if content_tag:
# 检查内容质量,过滤掉纯文本内容
content_text = content_tag.get_text(strip=True)
if len(content_text) < 100: # 如果内容太短,可能是纯文本
print(f"中国网内容过短,可能是纯文本,跳过: {url}")
return
# 检查是否包含足够的HTML标签图片、链接等
html_elements = content_tag.find_all(["img", "a", "p", "div", "span"])
if len(html_elements) < 3: # 如果HTML元素太少可能是纯文本
print(f"中国网内容HTML元素过少可能是纯文本跳过: {url}")
return
# 移除编辑信息 # 移除编辑信息
for editor_element in content_tag.find_all("div", class_="editor"): for editor_element in content_tag.find_all("div", class_="editor"):
editor_element.decompose() editor_element.decompose()
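
The two quality gates added for 中国网 (minimum text length, minimum markup density) generalize to any site; a sketch of the same checks, thresholds copied from the diff:

    def looks_like_real_article(content_tag, min_chars=100, min_elements=3):
        text = content_tag.get_text(strip=True)
        if len(text) < min_chars:
            return False  # too short: likely a redirect or plain-text stub
        elements = content_tag.find_all(["img", "a", "p", "div", "span"])
        return len(elements) >= min_elements  # too little markup: likely plain text
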
@@ -1355,6 +1618,71 @@ def process_article(url, website):
                     src = value
                     break

+            # 新增:查找新华网特有的视频播放器结构
+            if not src:
+                # 查找包含视频信息的script标签
+                for script in soup.find_all("script"):
+                    if script.string and "video" in script.string.lower():
+                        # 尝试从script中提取视频URL
+                        import re
+                        video_patterns = [
+                            r'https?://[^\s"\']+\.(?:mp4|flv|avi|mov|wmv)',
+                            r'https?://[^\s"\']+video[^\s"\']*',
+                            r'https?://[^\s"\']+media[^\s"\']*'
+                        ]
+                        for pattern in video_patterns:
+                            matches = re.findall(pattern, script.string)
+                            if matches:
+                                src = matches[0]
+                                break
+                        if src:
+                            break
+
+            # 新增:查找新华网特有的iframe视频播放器
+            if not src:
+                iframe = soup.find("iframe", src=lambda x: x and ("video" in x or "player" in x))
+                if iframe:
+                    src = iframe.get("src")
+
+            # 新增:查找新华网特有的视频播放器容器
+            if not src:
+                video_container = soup.find("div", class_="video-container") or soup.find("div", class_="player-container")
+                if video_container:
+                    # 在容器中查找视频元素
+                    video_elem = video_container.find("video")
+                    if video_elem:
+                        src = video_elem.get("src") or video_elem.get("data-src")
+                    # 如果没有找到video标签,查找source标签
+                    if not src:
+                        source_elem = video_container.find("source")
+                        if source_elem:
+                            src = source_elem.get("src") or source_elem.get("data-src")
+
+            # 新增:查找新华网特有的视频链接
+            if not src:
+                video_links = soup.find_all("a", href=lambda x: x and ("video" in x or "media" in x))
+                for link in video_links:
+                    href = link.get("href")
+                    if href and (".mp4" in href or ".flv" in href or "video" in href):
+                        src = href
+                        break
+
+            # 新增:直接从video标签的属性中获取src(处理新华网视频)
+            if not src and video.get("src"):
+                src = video.get("src")
+
+            # 新增:处理新华网视频(从示例代码中提取src)
+            if not src and "新华网" in website.name:
+                # 直接从video标签中获取src属性
+                if video.has_attr('src'):
+                    src = video.get('src')
+                # 检查是否有完整的属性列表
+                for attr in video.attrs:
+                    if isinstance(video.attrs[attr], str) and ('.mp4' in video.attrs[attr] or 'vodpub' in video.attrs[attr]):
+                        src = video.attrs[attr]
+                        break

             if not src:
                 continue
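
The script-scanning branch above carries the core of the new video recovery; a self-contained demonstration of how those regexes behave (the sample script and URL are invented):

    import re

    VIDEO_PATTERNS = [
        r'https?://[^\s"\']+\.(?:mp4|flv|avi|mov|wmv)',
        r'https?://[^\s"\']+video[^\s"\']*',
        r'https?://[^\s"\']+media[^\s"\']*',
    ]

    def extract_video_url(script_text):
        # Return the first match, as the diff's inner loop does.
        for pattern in VIDEO_PATTERNS:
            matches = re.findall(pattern, script_text)
            if matches:
                return matches[0]
        return None

    sample = 'var player = {src: "https://vodpub.example.cn/clip.mp4"};'
    print(extract_video_url(sample))  # https://vodpub.example.cn/clip.mp4
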
@@ -1449,7 +1777,36 @@ def full_site_crawler(start_url, website, max_pages=1000):
print(f"请求失败:{url},错误:{e}") print(f"请求失败:{url},错误:{e}")
continue continue
resp.encoding = 'utf-8' # 针对不同网站设置正确的编码
if "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
resp.encoding = 'utf-8'
elif "中国网" in website.name or "china.com.cn" in website.name:
resp.encoding = 'utf-8'
elif "中国新闻社" in website.name or "chinanews" in website.name:
resp.encoding = 'utf-8'
elif "中国政府网" in website.name or "gov.cn" in website.name:
resp.encoding = 'utf-8'
elif "工人日报" in website.name or "workercn" in website.name:
resp.encoding = 'utf-8'
elif "经济日报" in website.name or "ce.cn" in website.name:
resp.encoding = 'utf-8'
elif "求是" in website.name or "qstheory" in website.name:
resp.encoding = 'utf-8'
elif "旗帜网" in website.name or "qizhiwang" in website.name:
resp.encoding = 'utf-8'
elif "人民日报" in website.name or "people" in website.name:
resp.encoding = 'utf-8'
elif "人民政协网" in website.name or "rmzxw" in website.name:
resp.encoding = 'utf-8'
elif "学习时报" in website.name or "studytimes" in website.name:
resp.encoding = 'utf-8'
elif "中国妇女报" in website.name or "cnwomen" in website.name:
resp.encoding = 'utf-8'
elif "中国青年报" in website.name or "cyol" in website.name:
resp.encoding = 'utf-8'
else:
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, "html.parser") soup = BeautifulSoup(resp.text, "html.parser")
# 根据不同网站判断文章页面 # 根据不同网站判断文章页面