diff --git a/core/management/commands/crawl_china.py b/core/management/commands/crawl_china.py
index 2cd5eaf..4b330ac 100644
--- a/core/management/commands/crawl_china.py
+++ b/core/management/commands/crawl_china.py
@@ -3,6 +3,7 @@
 from core.models import Website
 from core.utils import full_site_crawler
 
+# jimmy.fang-20250815: removed the 中国网-省份 (province channel) crawl due to URL issues
 class Command(BaseCommand):
-    help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"
+    help = "全站递归爬取 中国网主网,不转发二级子网站"
 
@@ -22,12 +23,12 @@ class Command(BaseCommand):
                 'start_url': 'http://www.china.com.cn',
                 'article_selector': 'a'
             },
-            'province': {
-                'name': '中国网一省份',
-                'base_url': 'http://www.china.com.cn',
-                'start_url': 'http://www.china.com.cn/province',
-                'article_selector': 'a'
-            }
+            # 'province': {
+            #     'name': '中国网一省份',
+            #     'base_url': 'http://www.china.com.cn',
+            #     'start_url': 'http://www.china.com.cn/province',
+            #     'article_selector': 'a'
+            # }
         }
 
         if platform == 'all':
diff --git a/core/management/commands/crawl_chinadaily.py b/core/management/commands/crawl_chinadaily.py
index 69b5e7f..f7b05c6 100644
--- a/core/management/commands/crawl_chinadaily.py
+++ b/core/management/commands/crawl_chinadaily.py
@@ -9,7 +9,7 @@ class Command(BaseCommand):
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', choices=['chinadaily', 'mobile', 'all'],
-                          help='选择爬取平台: chinadaily(中国日报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all', choices=['chinadaily', 'all'],
+                          help='选择爬取平台: chinadaily(中国日报), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
@@ -22,12 +22,6 @@ class Command(BaseCommand):
                 'start_url': 'https://www.chinadaily.com.cn',
                 'article_selector': 'a'
             },
-            'mobile': {
-                'name': '中国日报移动端',
-                'base_url': 'https://m.chinadaily.com.cn',
-                'start_url': 'https://m.chinadaily.com.cn',
-                'article_selector': 'a'
-            }
         }
 
         if platform == 'all':
diff --git a/core/utils.py b/core/utils.py
index 25aac12..11e1138 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -233,12 +233,8 @@ def process_article(url, website):
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
-            soup.find("div", class_="article") or
-            # 添加央视新闻特有的内容容器
-            soup.find("div", class_="content_area") or
-            soup.find("div", id="content_area")
+            soup.find("div", class_="article")
         )
-
     elif "求是" in website.name:
         # 求是网站的文章结构处理 - 修复两个标题问题
         title_tag = (
@@ -360,6 +356,7 @@ def process_article(url, website):
             soup.find("title")
         )
         content_tag = (
+            soup.find("div", id="Content") or  # China Daily's content container
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
@@ -368,6 +365,41 @@ def process_article(url, website):
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
+
+        # Handle China Daily's paginated articles
+        if content_tag and ("中国日报" in website.name or "chinadaily" in website.name):
+            # Look for pagination links
+            page_links = []
+            current_page_elem = soup.find("div", id="div_currpage")
+            if current_page_elem:
+                # Collect all pagination links, skipping image-only anchors
+                page_links = [a for a in current_page_elem.find_all("a", href=True) if not a.find("img")]
+
+            # If the article is paginated, gather every page's content
+            if page_links:
+                print(f"发现分页内容,共 {len(page_links)} 页需要处理")
+                # Accumulate the HTML of all pages, starting with the first
+                all_content_html = str(content_tag)
+
+                # Fetch each pagination link
+                for page_link in page_links:
+                    page_url = urljoin(url, page_link['href'])
+                    if page_url != url:  # skip the first page, already captured
+                        try:
+                            page_resp = requests.get(page_url, headers=headers, timeout=15)
+                            page_resp.encoding = 'utf-8'
+                            page_soup = BeautifulSoup(page_resp.text, "html.parser")
+
+                            # Extract this page's content block
+                            page_content = page_soup.find("div", id="Content")
+                            if page_content:
+                                all_content_html += str(page_content)
print(f"已处理分页: {page_url}") + except Exception as e: + print(f"处理分页失败 {page_url}: {e}") + + # 更新content_tag为包含所有分页内容 + content_tag = BeautifulSoup(all_content_html, "html.parser") elif "工人日报" in website.name or "workercn" in website.name: # 工人日报的文章结构处理 - 修复不保存文章内容问题 title_tag = ( @@ -1003,6 +1035,8 @@ def full_site_crawler(start_url, website, max_pages=1000): soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or + (soup.find("div", id="Content") is not None and # 中国日报特有内容容器 + soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or