Fix chinadaily bug: Support more pages

2025-08-15 02:03:13 +08:00
parent 89909d2781
commit 1856f3e9fc
3 changed files with 48 additions and 18 deletions

View File

@@ -3,6 +3,7 @@ from core.models import Website
 from core.utils import full_site_crawler
+# jimmy.fang-20250815: removed the 中国网-省份 (province) site because of URL problems
 class Command(BaseCommand):
     help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"
@@ -22,12 +23,12 @@ class Command(BaseCommand):
                 'start_url': 'http://www.china.com.cn',
                 'article_selector': 'a'
             },
-            'province': {
-                'name': '中国网一省份',
-                'base_url': 'http://www.china.com.cn',
-                'start_url': 'http://www.china.com.cn/province',
-                'article_selector': 'a'
-            }
+            # 'province': {
+            #     'name': '中国网一省份',
+            #     'base_url': 'http://www.china.com.cn',
+            #     'start_url': 'http://www.china.com.cn/province',
+            #     'article_selector': 'a'
+            # }
         }
         if platform == 'all':

View File

@@ -9,7 +9,7 @@ class Command(BaseCommand):
     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
                             choices=['chinadaily', 'mobile', 'all'],
-                            help='选择爬取平台: chinadaily(中国日报), mobile(移动端), all(全部)')
+                            help='选择爬取平台: chinadaily(中国日报), all(全部)')

     def handle(self, *args, **options):
         platform = options['platform']
@@ -22,12 +22,7 @@ class Command(BaseCommand):
                 'start_url': 'https://www.chinadaily.com.cn',
                 'article_selector': 'a'
             },
-            'mobile': {
-                'name': '中国日报移动端',
-                'base_url': 'https://m.chinadaily.com.cn',
-                'start_url': 'https://m.chinadaily.com.cn',
-                'article_selector': 'a'
-            }
         }
         if platform == 'all':
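A hypothetical way to invoke the command after this change, via Django's call_command; "crawl_chinadaily" is a placeholder, since the management command's module name is not shown in this diff.

from django.core.management import call_command

# Placeholder command name; the --platform option maps to the keyword argument below.
call_command("crawl_chinadaily", platform="chinadaily")

Note that 'mobile' is still listed in choices even though its config entry is removed, so presumably only 'chinadaily' and 'all' now resolve to a crawl target.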

View File

@@ -233,12 +233,8 @@ def process_article(url, website):
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
-            soup.find("div", class_="article") or
-            # CCTV News-specific content containers
-            soup.find("div", class_="content_area") or
-            soup.find("div", id="content_area")
+            soup.find("div", class_="article")
         )
     elif "求是" in website.name:
         # Article structure handling for the 求是 (Qiushi) site - fixes the duplicated-title issue
         title_tag = (
@@ -360,6 +356,7 @@ def process_article(url, website):
             soup.find("title")
         )
         content_tag = (
+            soup.find("div", id="Content") or  # China Daily-specific content container
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
@@ -368,6 +365,41 @@ def process_article(url, website):
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
+        # Handle China Daily's paginated articles
+        if content_tag and ("中国日报" in website.name or "chinadaily" in website.name):
+            # Look for pagination links
+            page_links = []
+            current_page_elem = soup.find("div", id="div_currpage")
+            if current_page_elem:
+                # Collect every pagination link (skip image-only anchors)
+                page_links = [a for a in current_page_elem.find_all("a", href=True) if not a.find("img")]
+            # If the article is paginated, gather the content of every page
+            if page_links:
+                print(f"发现分页内容,共 {len(page_links)} 页需要处理")
+                # Start with the first page's content
+                all_content_html = str(content_tag)
+                # Process each pagination link
+                for page_link in page_links:
+                    page_url = urljoin(url, page_link['href'])
+                    if page_url != url:  # avoid processing the first page twice
+                        try:
+                            page_resp = requests.get(page_url, headers=headers, timeout=15)
+                            page_resp.encoding = 'utf-8'
+                            page_soup = BeautifulSoup(page_resp.text, "html.parser")
+                            # Extract this page's content
+                            page_content = page_soup.find("div", id="Content")
+                            if page_content:
+                                all_content_html += str(page_content)
+                                print(f"已处理分页: {page_url}")
+                        except Exception as e:
+                            print(f"处理分页失败 {page_url}: {e}")
+                # Replace content_tag with the combined content of all pages
+                content_tag = BeautifulSoup(all_content_html, "html.parser")
     elif "工人日报" in website.name or "workercn" in website.name:
         # Article structure handling for 工人日报 (Workers' Daily) - fixes article content not being saved
         title_tag = (
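The pagination handling added above can be read as the standalone helper below. This is an illustrative sketch, not part of the commit; it assumes the same requests, BeautifulSoup, and urljoin imports used in utils and takes the request headers as a parameter.

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def collect_paged_content(url, soup, content_tag, headers):
    """Combine div#Content from every page linked in div#div_currpage into one soup."""
    pager = soup.find("div", id="div_currpage")
    links = [a for a in pager.find_all("a", href=True) if not a.find("img")] if pager else []
    html = str(content_tag)
    for a in links:
        page_url = urljoin(url, a["href"])
        if page_url == url:
            continue  # the first page is already in content_tag
        resp = requests.get(page_url, headers=headers, timeout=15)
        resp.encoding = "utf-8"
        page = BeautifulSoup(resp.text, "html.parser").find("div", id="Content")
        if page:
            html += str(page)
    return BeautifulSoup(html, "html.parser")

The design keeps the first page's content_tag as the base string and concatenates each extra page's div#Content HTML before re-parsing everything into a single soup.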
@@ -1003,6 +1035,8 @@ def full_site_crawler(start_url, website, max_pages=1000):
             soup.find("div", class_="article-content") is not None or
             (soup.find("div", id="content") is not None and
              soup.find("h1") is not None) or
+            (soup.find("div", id="Content") is not None and  # China Daily-specific content container
+             soup.find("h1") is not None) or
             soup.find("div", class_="text") is not None or
             soup.find("div", class_="main-content") is not None or
             soup.find("div", class_="article") is not None or