Support CCTV Platforms

2025-08-15 01:08:53 +08:00
parent ac98ac0057
commit 89909d2781
5 changed files with 18 additions and 18 deletions
--- a/BUG_FIXES_SUMMARY.md
+++ b/BUG_FIXES_SUMMARY.md
@@ -195,3 +195,4 @@
 3. **反爬虫处理**: 添加更复杂的反爬虫绕过机制
 4. **性能监控**: 添加性能监控和统计功能
 5. **内容质量**: 增加内容质量检测和过滤机制
+
--- a/CRAWLER_README.md
+++ b/CRAWLER_README.md
@@ -176,3 +176,4 @@ python manage.py crawl_all_media
 ## 查看结果

 爬取完成后，可以通过Django管理界面或导出命令查看爬取的文章数据。
+
--- a/core/management/commands/crawl_cctv.py
+++ b/core/management/commands/crawl_cctv.py
@@ -3,36 +3,32 @@ from core.models import Website
 from core.utils import full_site_crawler


+# jimmy.fang:20250815: 因 CCTV 的视频有做加密动作，无法下载，移除支持
 class Command(BaseCommand):
    help = "全站递归爬取 中央广播电视总台及其子网站、客户端、新媒体平台"

    def add_arguments(self, parser):
        parser.add_argument('--platform', type=str, default='all',
                            choices=['cctv', 'cctvnews', 'mobile', 'all'],
-                            help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), mobile(移动端), all(全部)')
+                            help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), all(全部)')

    def handle(self, *args, **options):
        platform = options['platform']

        # 中央广播电视总台各平台配置
        platforms = {
-            'cctv': {
-                'name': '央视网',
-                'base_url': 'https://www.cctv.com',
-                'start_url': 'https://www.cctv.com',
-                'article_selector': 'a'
-            },
+            # jimmy.fang:20250815: 因 CCTV 的视频有做加密动作，无法下载，移除支持
+            # 'cctv': {
+            #     'name': '央视网',
+            #     'base_url': 'https://www.cctv.com',
+            #     'start_url': 'https://www.cctv.com',
+            #     'article_selector': 'a'
+            # },
            'cctvnews': {
                'name': '央视新闻',
                'base_url': 'https://news.cctv.com',
                'start_url': 'https://news.cctv.com',
                'article_selector': 'a'
-            },
-            'mobile': {
-                'name': '央视移动端',
-                'base_url': 'https://m.cctv.com',
-                'start_url': 'https://m.cctv.com',
-                'article_selector': 'a'
            }
        }

--- a/core/utils.py
+++ b/core/utils.py
@@ -233,8 +233,12 @@ def process_article(url, website):
                soup.find("div", id="content") or
                soup.find("div", class_="text") or
                soup.find("div", class_="main-content") or
-                soup.find("div", class_="article")
+                soup.find("div", class_="article") or
+                # 添加央视新闻特有的内容容器
+                soup.find("div", class_="content_area") or
+                soup.find("div", id="content_area")
        )
+
    elif "求是" in website.name:
        # 求是网站的文章结构处理 - 修复两个标题问题
        title_tag = (
@@ -553,7 +557,6 @@ def process_article(url, website):
        title_tag = (
                soup.find("h1", class_="title") or
                soup.find("h1") or
-                soup.find("p", class_="f_container_title") or  # 添加中国妇女报特有标题容器
                soup.find("title")
        )
        content_tag = (
@@ -563,9 +566,7 @@ def process_article(url, website):
                soup.find("div", class_="text") or
                soup.find("div", class_="main-content") or
                soup.find("div", class_="article") or
-                soup.find("div", class_="article-body") or
-                soup.find("div", class_="f_container_left") or  # 添加中国妇女报特有内容容器
-                soup.find("div", class_="f_container")  # 添加另一种可能的内容容器
+                soup.find("div", class_="article-body")
        )
    elif "法治日报" in website.name or "legaldaily" in website.name:
        # 法治日报的文章结构处理 - 修复无法爬取问题
--- a/test_crawlers.py
+++ b/test_crawlers.py
@@ -119,3 +119,4 @@ def main():

 if __name__ == '__main__':
    main()
+