Fix chinadaily bug: Support more pages

2025-08-15 02:03:13 +08:00
parent 89909d2781
commit 1856f3e9fc
3 changed files with 48 additions and 18 deletions

View File

@@ -3,6 +3,7 @@ from core.models import Website
 from core.utils import full_site_crawler
+# jimmy.fang-20250815: removed the 中国网-省份 (province) site because of URL problems
 class Command(BaseCommand):
     help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"
@@ -22,12 +23,12 @@ class Command(BaseCommand):
                 'start_url': 'http://www.china.com.cn',
                 'article_selector': 'a'
             },
-            'province': {
-                'name': '中国网一省份',
-                'base_url': 'http://www.china.com.cn',
-                'start_url': 'http://www.china.com.cn/province',
-                'article_selector': 'a'
-            }
+            # 'province': {
+            #     'name': '中国网一省份',
+            #     'base_url': 'http://www.china.com.cn',
+            #     'start_url': 'http://www.china.com.cn/province',
+            #     'article_selector': 'a'
+            # }
         }
         if platform == 'all':

View File

@@ -9,7 +9,7 @@ class Command(BaseCommand):
     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
                             choices=['chinadaily', 'mobile', 'all'],
-                            help='选择爬取平台: chinadaily(中国日报), mobile(移动端), all(全部)')
+                            help='选择爬取平台: chinadaily(中国日报), all(全部)')

     def handle(self, *args, **options):
         platform = options['platform']
@@ -22,12 +22,7 @@ class Command(BaseCommand):
                 'start_url': 'https://www.chinadaily.com.cn',
                 'article_selector': 'a'
             },
-            'mobile': {
-                'name': '中国日报移动端',
-                'base_url': 'https://m.chinadaily.com.cn',
-                'start_url': 'https://m.chinadaily.com.cn',
-                'article_selector': 'a'
-            }
         }
         if platform == 'all':
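A hypothetical way to invoke the command after this change, via Django's call_command; "crawl_chinadaily" is a placeholder, since the management command's module name is not shown in this diff.

from django.core.management import call_command

# Placeholder command name; the --platform option maps to the keyword argument below.
call_command("crawl_chinadaily", platform="chinadaily")

Note that 'mobile' is still listed in choices even though its config entry is removed, so presumably only 'chinadaily' and 'all' now resolve to a crawl target.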

View File

@@ -233,12 +233,8 @@ def process_article(url, website):
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
-            soup.find("div", class_="article") or
-            # CCTV News-specific content containers
-            soup.find("div", class_="content_area") or
-            soup.find("div", id="content_area")
+            soup.find("div", class_="article")
         )
     elif "求是" in website.name:
         # Article structure handling for the 求是 (Qiushi) site - fixes the duplicated-title issue
         title_tag = (
@@ -360,6 +356,7 @@ def process_article(url, website):
             soup.find("title")
         )
         content_tag = (
+            soup.find("div", id="Content") or  # China Daily-specific content container
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
@@ -368,6 +365,41 @@ def process_article(url, website):
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
+        # Handle China Daily's paginated articles
+        if content_tag and ("中国日报" in website.name or "chinadaily" in website.name):
+            # Look for pagination links
+            page_links = []
+            current_page_elem = soup.find("div", id="div_currpage")
+            if current_page_elem:
+                # Collect every pagination link (skip image-only anchors)
+                page_links = [a for a in current_page_elem.find_all("a", href=True) if not a.find("img")]
+            # If the article is paginated, gather the content of every page
+            if page_links:
+                print(f"发现分页内容,共 {len(page_links)} 页需要处理")
+                # Start with the first page's content
+                all_content_html = str(content_tag)
+                # Process each pagination link
+                for page_link in page_links:
+                    page_url = urljoin(url, page_link['href'])
+                    if page_url != url:  # avoid processing the first page twice
+                        try:
+                            page_resp = requests.get(page_url, headers=headers, timeout=15)
+                            page_resp.encoding = 'utf-8'
+                            page_soup = BeautifulSoup(page_resp.text, "html.parser")
+                            # Extract this page's content
+                            page_content = page_soup.find("div", id="Content")
+                            if page_content:
+                                all_content_html += str(page_content)
+                                print(f"已处理分页: {page_url}")
+                        except Exception as e:
+                            print(f"处理分页失败 {page_url}: {e}")
+                # Replace content_tag with the combined content of all pages
+                content_tag = BeautifulSoup(all_content_html, "html.parser")
     elif "工人日报" in website.name or "workercn" in website.name:
         # Article structure handling for 工人日报 (Workers' Daily) - fixes article content not being saved
         title_tag = (
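The pagination handling added above can be read as the standalone helper below. This is an illustrative sketch, not part of the commit; it assumes the same requests, BeautifulSoup, and urljoin imports used in utils and takes the request headers as a parameter.

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def collect_paged_content(url, soup, content_tag, headers):
    """Combine div#Content from every page linked in div#div_currpage into one soup."""
    pager = soup.find("div", id="div_currpage")
    links = [a for a in pager.find_all("a", href=True) if not a.find("img")] if pager else []
    html = str(content_tag)
    for a in links:
        page_url = urljoin(url, a["href"])
        if page_url == url:
            continue  # the first page is already in content_tag
        resp = requests.get(page_url, headers=headers, timeout=15)
        resp.encoding = "utf-8"
        page = BeautifulSoup(resp.text, "html.parser").find("div", id="Content")
        if page:
            html += str(page)
    return BeautifulSoup(html, "html.parser")

The design keeps the first page's content_tag as the base string and concatenates each extra page's div#Content HTML before re-parsing everything into a single soup.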
@@ -1003,6 +1035,8 @@ def full_site_crawler(start_url, website, max_pages=1000):
             soup.find("div", class_="article-content") is not None or
             (soup.find("div", id="content") is not None and
              soup.find("h1") is not None) or
+            (soup.find("div", id="Content") is not None and  # China Daily-specific content container
+             soup.find("h1") is not None) or
             soup.find("div", class_="text") is not None or
             soup.find("div", class_="main-content") is not None or
             soup.find("div", class_="article") is not None or