deploy test

2025-08-17 02:12:25 +08:00
parent 4945b4c6b0
commit 193894fcb4
20 changed files with 413 additions and 56 deletions

View File

@@ -9,7 +9,7 @@ class Command(BaseCommand):
     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
-                            choices=['cctv', 'cctvnews', 'all'],
+                            choices=['cctvnews', 'all'],
                             help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), all(全部)')

     def handle(self, *args, **options):
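
All of the command files touched below share the same Django management-command skeleton (note that this hunk drops 'cctv' from choices while the help string still lists cctv(央视网)). For reference, a minimal runnable sketch of the pattern — file path and platform names are hypothetical:

    # crawler/management/commands/crawl_example.py (hypothetical path)
    from django.core.management.base import BaseCommand

    class Command(BaseCommand):
        help = "Recursively crawl an example platform"

        def add_arguments(self, parser):
            # choices= restricts --platform to the listed values
            parser.add_argument('--platform', type=str, default='all',
                                choices=['example', 'all'],
                                help='Platform to crawl: example or all')

        def handle(self, *args, **options):
            platform = options['platform']
            self.stdout.write(f"Crawling platform: {platform}")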

View File

@@ -4,12 +4,12 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"
+    help = "全站递归爬取 中国网主网"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
                             choices=['china', 'all'],
-                            help='选择爬取平台: china(中国网主网), province(中国网一省份), all(全部)')
+                            help='选择爬取平台: china(中国网主网), all(全部)')

     def handle(self, *args, **options):
         platform = options['platform']

View File

@@ -4,11 +4,11 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
-                            choices=['chinadaily', 'mobile', 'all'],
+                            choices=['chinadaily', 'all'],
                             help='选择爬取平台: chinadaily(中国日报), all(全部)')

     def handle(self, *args, **options):

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国新闻社及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国新闻社平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 法治日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 法治日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -5,7 +5,7 @@ from core.utils import full_site_crawler
 # jimmy.fang-20250815: 取消对光明日报的支持,光明日报反爬,被阻挡
 class Command(BaseCommand):
-    help = "全站递归爬取 光明日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 光明日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 工人日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 工人日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 经济日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 经济日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -5,7 +5,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 科技日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 科技日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 解放军报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 解放军报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 求是杂志及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 求是杂志平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 旗帜网及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 旗帜网平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 人民政协网及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 人民政协网平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 新华社及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 新华社平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 学习强国中央媒体学习号及省级以上学习平台"
+    help = "全站递归爬取 学习强国平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 学习时报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 学习时报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国妇女报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国妇女报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国纪检监察报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国纪检监察报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -4,7 +4,7 @@ from core.utils import full_site_crawler
 class Command(BaseCommand):
-    help = "全站递归爬取 中国青年报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国青年报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',

View File

@@ -54,6 +54,28 @@ def get_page_with_selenium(url, website_name):
         wait_time = 15  # 学习强国需要更长时间
     elif "法治日报" in website_name:
         wait_time = 12  # 法治日报需要较长时间
+    elif "中国新闻社" in website_name or "chinanews" in website_name:
+        wait_time = 12  # 中国新闻社需要较长时间
+    elif "中国政府网" in website_name or "gov.cn" in website_name:
+        wait_time = 12  # 中国政府网需要较长时间
+    elif "工人日报" in website_name or "workercn" in website_name:
+        wait_time = 12  # 工人日报需要较长时间
+    elif "经济日报" in website_name or "ce.cn" in website_name:
+        wait_time = 12  # 经济日报需要较长时间
+    elif "求是" in website_name or "qstheory" in website_name:
+        wait_time = 12  # 求是网需要较长时间
+    elif "旗帜网" in website_name or "qizhiwang" in website_name:
+        wait_time = 12  # 旗帜网需要较长时间
+    elif "人民日报" in website_name or "people" in website_name:
+        wait_time = 12  # 人民日报需要较长时间
+    elif "人民政协网" in website_name or "rmzxw" in website_name:
+        wait_time = 12  # 人民政协网需要较长时间
+    elif "学习时报" in website_name or "studytimes" in website_name:
+        wait_time = 12  # 学习时报需要较长时间
+    elif "中国妇女报" in website_name or "cnwomen" in website_name:
+        wait_time = 12  # 中国妇女报需要较长时间
+    elif "中国青年报" in website_name or "cyol" in website_name:
+        wait_time = 12  # 中国青年报需要较长时间

     # 等待页面主要内容加载
     try:
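
The elif ladder above grows by two lines per site; the same behavior could be expressed as a keyword-to-timeout table. A sketch, not part of the commit (the 10-second default is an assumption — the fallback is not shown in the diff):

    # Hypothetical refactor: first matching keyword wins, like the elif chain.
    SITE_WAIT_TIMES = {
        "学习强国": 15, "xuexi": 15,
        "法治日报": 12, "chinanews": 12, "gov.cn": 12, "workercn": 12,
        "ce.cn": 12, "qstheory": 12, "qizhiwang": 12, "people": 12,
        "rmzxw": 12, "studytimes": 12, "cnwomen": 12, "cyol": 12,
    }

    def get_wait_time(website_name, default=10):
        for keyword, seconds in SITE_WAIT_TIMES.items():
            if keyword in website_name:
                return seconds
        return default  # assumed fallback
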
@@ -234,10 +256,90 @@ def process_article(url, website):
"Upgrade-Insecure-Requests": "1", "Upgrade-Insecure-Requests": "1",
"Referer": "http://www.qizhiwang.org.cn/" "Referer": "http://www.qizhiwang.org.cn/"
}) })
# 添加中国新闻社的特殊请求头
elif "中国新闻社" in website.name or "chinanews" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://www.chinanews.com.cn/"
})
# 添加中国政府网的特殊请求头
elif "中国政府网" in website.name or "gov.cn" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://www.gov.cn/"
})
# 添加经济日报的特殊请求头
elif "经济日报" in website.name or "ce.cn" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "http://www.ce.cn/"
})
# 添加求是网的特殊请求头
elif "求是" in website.name or "qstheory" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "http://www.qstheory.cn/"
})
# 添加人民政协网的特殊请求头
elif "人民政协网" in website.name or "rmzxw" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://www.rmzxw.com.cn/"
})
# 添加学习时报的特殊请求头
elif "学习时报" in website.name or "studytimes" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://www.studytimes.cn/"
})
# 添加中国妇女报的特殊请求头
elif "中国妇女报" in website.name or "cnwomen" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://www.cnwomen.com.cn/"
})
# 添加中国青年报的特殊请求头
elif "中国青年报" in website.name or "cyol" in website.name:
headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Referer": "http://news.cyol.com/"
})
# 判断是否需要使用Selenium # 判断是否需要使用Selenium
need_selenium = False need_selenium = False
if any(name in website.name for name in ["学习强国", "xuexi", "法治日", "legaldaily"]): if any(name in website.name for name in ["中国妇女报", "cnwomen", "中国纪检监察", "jjjcb", "中国青年报", "cyol"]):
need_selenium = True need_selenium = True
try: try:
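
Each per-site branch in this hunk sets identical Accept/Connection headers and differs only in Referer, so the eight blocks could collapse into shared constants. A sketch under that assumption (names invented):

    # Hypothetical consolidation of the per-site header branches.
    COMMON_HEADERS = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    SITE_REFERERS = {
        "chinanews": "https://www.chinanews.com.cn/",
        "gov.cn": "https://www.gov.cn/",
        "ce.cn": "http://www.ce.cn/",
        "qstheory": "http://www.qstheory.cn/",
        "rmzxw": "https://www.rmzxw.com.cn/",
        "studytimes": "https://www.studytimes.cn/",
        "cnwomen": "https://www.cnwomen.com.cn/",
        "cyol": "http://news.cyol.com/",
    }

    def build_headers(website_name, headers):
        # Shared headers plus the first matching site's Referer.
        headers.update(COMMON_HEADERS)
        for keyword, referer in SITE_REFERERS.items():
            if keyword in website_name:
                headers["Referer"] = referer
                break
        return headers
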
@@ -249,7 +351,7 @@ def process_article(url, website):
             return

     # 检查页面内容是否过短
-    min_length = 100 if "法治日报" in website.name else 300
+    min_length = 200
     if len(page_source) < min_length:
         print(f"页面内容过短,可能是重定向页面:{url}")
         return
@@ -266,6 +368,40 @@ def process_article(url, website):
print(f"页面内容过短,可能是重定向页面:{url}") print(f"页面内容过短,可能是重定向页面:{url}")
return return
# 针对不同网站设置正确的编码
if "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
resp.encoding = 'utf-8'
elif "中国网" in website.name or "china.com.cn" in website.name:
resp.encoding = 'utf-8'
elif "中国新闻社" in website.name or "chinanews" in website.name:
resp.encoding = 'utf-8'
elif "中国政府网" in website.name or "gov.cn" in website.name:
resp.encoding = 'utf-8'
elif "工人日报" in website.name or "workercn" in website.name:
resp.encoding = 'utf-8'
elif "经济日报" in website.name or "ce.cn" in website.name:
resp.encoding = 'utf-8'
elif "求是" in website.name or "qstheory" in website.name:
resp.encoding = 'utf-8'
elif "旗帜网" in website.name or "qizhiwang" in website.name:
resp.encoding = 'utf-8'
elif "人民日报" in website.name or "people" in website.name:
resp.encoding = 'utf-8'
elif "人民政协网" in website.name or "rmzxw" in website.name:
resp.encoding = 'utf-8'
elif "学习时报" in website.name or "studytimes" in website.name:
resp.encoding = 'utf-8'
elif "中国妇女报" in website.name or "cnwomen" in website.name:
resp.encoding = 'utf-8'
elif "中国青年报" in website.name or "cyol" in website.name:
resp.encoding = 'utf-8'
elif "学习强国" in website.name or "xuexi" in website.name:
resp.encoding = 'utf-8'
elif "法治日报" in website.name or "legaldaily" in website.name:
resp.encoding = 'utf-8'
else:
resp.encoding = 'utf-8'
# 创建BeautifulSoup对象 # 创建BeautifulSoup对象
soup = BeautifulSoup(resp.text, "html.parser") soup = BeautifulSoup(resp.text, "html.parser")
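
Every branch of the new chain assigns 'utf-8', so it is behaviorally a single assignment. If genuine per-site detection is ever needed again, requests can guess from the response body; a sketch, not part of the commit:

    import requests

    def fetch_text(url):
        resp = requests.get(url, timeout=10)
        # Current commit behavior: force UTF-8 for every site.
        resp.encoding = 'utf-8'
        # Possible alternative: let requests guess from the payload.
        # resp.encoding = resp.apparent_encoding or 'utf-8'
        return resp.text
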
@@ -274,29 +410,32 @@ def process_article(url, website):
         return

     # 针对不同网站设置正确的编码(仅对requests获取的内容)
-    if not need_selenium:
-        if "人民网" in website.name or "人民日报" in website.name:
-            resp.encoding = 'utf-8'
-        elif "新华网" in website.name:
-            resp.encoding = 'utf-8'
-        elif "央视" in website.name or "CCTV" in website.name:
-            resp.encoding = 'utf-8'
-        elif "农民日报" in website.name or "farmer" in website.name:
-            resp.encoding = 'utf-8'
-            # 尝试其他编码
-            if '锘' in resp.text or len(resp.text) < 1000:
-                resp.encoding = 'gbk'
-                # 进一步尝试其他编码
-                if '锘' in resp.text or '锘' in resp.text:
-                    resp.encoding = 'gb2312'
-                    # 如果还是有问题,尝试更多编码
-                    if '锘' in resp.text or '锘' in resp.text:
-                        resp.encoding = 'utf-8-sig'
-                        # 最后尝试
-                        if '锘' in resp.text or '锘' in resp.text:
-                            resp.encoding = 'big5'
-    else:
-        resp.encoding = 'utf-8'
+    # 注释掉原有的编码处理逻辑,统一使用UTF-8
+    # if not need_selenium:
+    #     if "人民网" in website.name or "人民日报" in website.name:
+    #         resp.encoding = 'utf-8'
+    #     elif "新华网" in website.name:
+    #         resp.encoding = 'utf-8'
+    #     elif "农民日报" in website.name or "farmer" in website.name:
+    #         resp.encoding = 'utf-8'
+    #         # 尝试其他编码
+    #         if '锘' in resp.text or len(resp.text) < 1000:
+    #             resp.encoding = 'gbk'
+    #             # 进一步尝试其他编码
+    #             if '锘' in resp.text or '锘' in resp.text:
+    #                 resp.encoding = 'gb2312'
+    #                 # 如果还是有问题,尝试更多编码
+    #                 if '锘' in resp.text or '锘' in resp.text:
+    #                     resp.encoding = 'utf-8-sig'
+    #                     # 最后尝试
+    #                     if '锘' in resp.text or '锘' in resp.text:
+    #                         resp.encoding = 'big5'
+    # else:
+    #     resp.encoding = 'utf-8'
+
+    # 统一设置编码为UTF-8,解决乱码问题
+    #if not need_selenium:
+    #    resp.encoding = 'utf-8'

     # 处理不同网站的文章结构
     if website.name == "新华网" or website.name == "www.news.cn" or "新华网" in website.name:
@@ -362,6 +501,16 @@ def process_article(url, website):
                 title_element_text = title_element.get_text(strip=True)
                 if title_text in title_element_text or title_element_text in title_text:
                     title_element.decompose()

+            # 移除id为ti的元素(中国政府网特有的标题元素)
+            for ti_element in content_tag.find_all(id="ti"):
+                ti_element.decompose()
+
+            # 移除包含"简历"等关键词的重复标题
+            for element in content_tag.find_all(["h1", "h2", "h3", "strong", "b"]):
+                element_text = element.get_text(strip=True)
+                if "简历" in element_text and len(element_text) < 20:
+                    element.decompose()

         # 移除编辑信息
         for editor_element in content_tag.find_all("div", class_="editor"):
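
The two removal passes added here could live in one helper; a sketch mirroring the diff's logic (function name and defaults invented):

    def strip_duplicate_headings(content_tag, keyword="简历", max_len=20):
        # Drop gov.cn's dedicated title node (id="ti").
        for ti_element in content_tag.find_all(id="ti"):
            ti_element.decompose()
        # Drop short repeated headings that contain the keyword.
        for element in content_tag.find_all(["h1", "h2", "h3", "strong", "b"]):
            text = element.get_text(strip=True)
            if keyword in text and len(text) < max_len:
                element.decompose()
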
@@ -394,7 +543,19 @@ def process_article(url, website):
soup.find("section", class_="content") or soup.find("section", class_="content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="rm_txt_con") or # 添加人民网特有的内容容器 soup.find("div", class_="rm_txt_con") or # 添加人民网特有的内容容器
soup.find("div", class_="text_c") # 添加新的内容容器 soup.find("div", class_="text_c") or # 添加新的内容容器
soup.find("div", class_="article-detail") or # 人民日报文章详情容器
soup.find("div", class_="detail-content") or # 人民日报详情内容容器
soup.find("div", class_="article-text") or # 人民日报文章文本容器
soup.find("div", class_="content-text") or # 人民日报内容文本容器
soup.find("div", class_="news-content") or # 人民日报新闻内容容器
soup.find("div", class_="news-text") or # 人民日报新闻文本容器
soup.find("div", class_="news-detail") or # 人民日报新闻详情容器
soup.find("div", class_="article-main") or # 人民日报文章主体容器
soup.find("div", class_="article-container") or # 人民日报文章容器
soup.find("div", class_="content-container") or # 人民日报内容容器
soup.find("div", class_="text-container") or # 人民日报文本容器
soup.find("div", class_="main-container") # 人民日报主体容器
) )
# 针对人民网的特殊处理,清理内容中的无关元素 # 针对人民网的特殊处理,清理内容中的无关元素
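
This or-chain of soup.find calls, repeated for several sites below, is equivalent to a loop over candidate class names; a sketch with the classes added by this hunk (function name invented):

    CONTENT_CLASSES = [
        "rm_txt_con", "text_c", "article-detail", "detail-content",
        "article-text", "content-text", "news-content", "news-text",
        "news-detail", "article-main", "article-container",
        "content-container", "text-container", "main-container",
    ]

    def find_content_tag(soup):
        # First matching class wins, exactly like the chained `or`s.
        for cls in CONTENT_CLASSES:
            tag = soup.find("div", class_=cls)
            if tag is not None:
                return tag
        return None
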
@@ -454,7 +615,19 @@ def process_article(url, website):
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="article-detail") or # 央视网文章详情容器
soup.find("div", class_="detail-content") or # 央视网详情内容容器
soup.find("div", class_="article-text") or # 央视网文章文本容器
soup.find("div", class_="content-text") or # 央视网内容文本容器
soup.find("div", class_="news-content") or # 央视网新闻内容容器
soup.find("div", class_="news-text") or # 央视网新闻文本容器
soup.find("div", class_="news-detail") or # 央视网新闻详情容器
soup.find("div", class_="article-main") or # 央视网文章主体容器
soup.find("div", class_="article-container") or # 央视网文章容器
soup.find("div", class_="content-container") or # 央视网内容容器
soup.find("div", class_="text-container") or # 央视网文本容器
soup.find("div", class_="main-container") # 央视网主体容器
) )
# 针对央视网的特殊处理,清理内容中的无关元素 # 针对央视网的特殊处理,清理内容中的无关元素
@@ -502,6 +675,17 @@ def process_article(url, website):
     # 针对求是的特殊处理,确保标题被正确提取
     if not title_tag or not title_tag.get_text(strip=True):
         title_tag = soup.find("title")

+    # 针对求是的特殊处理,如果标题包含"海报"等关键词,尝试从内容中提取更好的标题
+    if title_tag:
+        title_text = title_tag.get_text(strip=True)
+        if "海报" in title_text or "" in title_text:
+            # 尝试从内容中查找更好的标题
+            content_h1 = soup.find("h1")
+            if content_h1 and content_h1 != title_tag:
+                content_title = content_h1.get_text(strip=True)
+                if len(content_title) > len(title_text) and "海报" not in content_title:
+                    title_tag = content_h1

     content_tag = (
         soup.find("div", class_="content") or
@@ -819,14 +1003,41 @@ def process_article(url, website):
         # 特殊处理人民政协网的标题结构
         if title_tag and title_tag.find("span", id="a"):
             title_tag = title_tag.find("span", id="a")
-        elif title_tag and title_tag.get_text(strip=True) == "首页>聚焦":
+        elif title_tag and (title_tag.get_text(strip=True) == "首页>聚焦" or title_tag.get_text(strip=True) == "首页 > 聚焦"):
             # 如果标题还是"首页>聚焦",尝试从内容中提取标题
-            if content_tag:
-                first_p = content_tag.find("p")
+            # 查找文章正文中的第一个strong标签作为标题
+            content_div = soup.find("div", class_="text_box")
+            if content_div:
+                first_p = content_div.find("p")
                 if first_p and first_p.find("strong"):
                     title_text = first_p.find("strong").get_text().strip()
                     # 创建一个虚拟的title_tag对象
                     title_tag = first_p.find("strong")
+                else:
+                    # 如果没有找到strong标签,尝试查找内容中的第一个h2标签
+                    first_h2 = content_div.find("h2")
+                    if first_h2:
+                        title_tag = first_h2
+
+        # 针对人民政协网的特殊处理,如果标题包含"首页>聚焦",尝试从页面中查找更好的标题
+        if title_tag and ("首页>聚焦" in title_tag.get_text(strip=True) or "首页 > 聚焦" in title_tag.get_text(strip=True)):
+            # 尝试从页面中查找其他可能的标题
+            for h in soup.find_all(["h1", "h2", "h3"]):
+                h_text = h.get_text(strip=True)
+                if h_text and "首页>聚焦" not in h_text and "首页 > 聚焦" not in h_text and len(h_text) > 5:
+                    title_tag = h
+                    break
+            # 如果还是没找到,尝试从title标签提取
+            if "首页>聚焦" in title_tag.get_text(strip=True) or "首页 > 聚焦" in title_tag.get_text(strip=True):
+                page_title = soup.find("title")
+                if page_title:
+                    title_text = page_title.get_text(strip=True)
+                    # 移除网站名称等后缀信息
+                    if " - 人民政协网" in title_text:
+                        title_text = title_text.split(" - 人民政协网")[0]
+                    if "首页>聚焦" not in title_text and "首页 > 聚焦" not in title_text and len(title_text) > 5:
+                        title_tag = page_title

         content_tag = (
             soup.find("div", class_="content") or
@@ -972,7 +1183,14 @@ def process_article(url, website):
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="article-detail") or # 中国青年报文章详情容器
soup.find("div", class_="detail-content") or # 中国青年报详情内容容器
soup.find("div", class_="article-text") or # 中国青年报文章文本容器
soup.find("div", class_="content-text") or # 中国青年报内容文本容器
soup.find("div", class_="news-content") or # 中国青年报新闻内容容器
soup.find("div", class_="news-text") or # 中国青年报新闻文本容器
soup.find("div", class_="news-detail") # 中国青年报新闻详情容器
) )
elif "中国妇女报" in website.name or "cnwomen" in website.name: elif "中国妇女报" in website.name or "cnwomen" in website.name:
# 中国妇女报的文章结构处理 - 修复不保存文章内容问题 # 中国妇女报的文章结构处理 - 修复不保存文章内容问题
@@ -987,6 +1205,9 @@ def process_article(url, website):
             title_tag = soup.find("title")

         content_tag = (
+            soup.find("div", class_="f_container") or  # 中国妇女报特有内容容器
+            soup.find("div", class_="f_container_left") or  # 中国妇女报特有内容容器
+            soup.find("div", class_="f_navigation_bars") or  # 中国妇女报特有内容容器
             soup.find("div", class_="main") or  # 中国妇女报特有内容容器
             soup.find("div", class_="news") or  # 中国妇女报特有内容容器
             soup.find("div", class_="content") or
@@ -995,7 +1216,19 @@ def process_article(url, website):
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="article-detail") or # 中国妇女报文章详情容器
soup.find("div", class_="detail-content") or # 中国妇女报详情内容容器
soup.find("div", class_="article-text") or # 中国妇女报文章文本容器
soup.find("div", class_="content-text") or # 中国妇女报内容文本容器
soup.find("div", class_="news-content") or # 中国妇女报新闻内容容器
soup.find("div", class_="news-text") or # 中国妇女报新闻文本容器
soup.find("div", class_="news-detail") or # 中国妇女报新闻详情容器
soup.find("div", class_="article-main") or # 中国妇女报文章主体容器
soup.find("div", class_="article-container") or # 中国妇女报文章容器
soup.find("div", class_="content-container") or # 中国妇女报内容容器
soup.find("div", class_="text-container") or # 中国妇女报文本容器
soup.find("div", class_="main-container") # 中国妇女报主体容器
) )
elif "法治日报" in website.name or "legaldaily" in website.name: elif "法治日报" in website.name or "legaldaily" in website.name:
# 法治日报的文章结构处理 - 修复不保存正文和图片问题 # 法治日报的文章结构处理 - 修复不保存正文和图片问题
@@ -1080,7 +1313,19 @@ def process_article(url, website):
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="article-detail") or # 农民日报文章详情容器
soup.find("div", class_="detail-content") or # 农民日报详情内容容器
soup.find("div", class_="article-text") or # 农民日报文章文本容器
soup.find("div", class_="content-text") or # 农民日报内容文本容器
soup.find("div", class_="news-content") or # 农民日报新闻内容容器
soup.find("div", class_="news-text") or # 农民日报新闻文本容器
soup.find("div", class_="news-detail") or # 农民日报新闻详情容器
soup.find("div", class_="article-main") or # 农民日报文章主体容器
soup.find("div", class_="article-container") or # 农民日报文章容器
soup.find("div", class_="content-container") or # 农民日报内容容器
soup.find("div", class_="text-container") or # 农民日报文本容器
soup.find("div", class_="main-container") # 农民日报主体容器
) )
# 针对农民日报的特殊处理如果找到多个detailCon选择内容最长的那个 # 针对农民日报的特殊处理如果找到多个detailCon选择内容最长的那个
@@ -1253,11 +1498,29 @@ def process_article(url, website):
soup.find("div", id="content") or soup.find("div", id="content") or
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="news-content") or # 中国网新闻内容容器
soup.find("div", class_="news-text") or # 中国网新闻文本容器
soup.find("div", class_="news-detail") or # 中国网新闻详情容器
soup.find("div", class_="detail-content") or # 中国网详情内容容器
soup.find("div", class_="article-text") or # 中国网文章文本容器
soup.find("div", class_="content-text") # 中国网内容文本容器
) )
# 针对中国网的特殊处理,清理内容中的无关元素 # 针对中国网的特殊处理,清理内容中的无关元素
if content_tag: if content_tag:
# 检查内容质量,过滤掉纯文本内容
content_text = content_tag.get_text(strip=True)
if len(content_text) < 100: # 如果内容太短,可能是纯文本
print(f"中国网内容过短,可能是纯文本,跳过: {url}")
return
# 检查是否包含足够的HTML标签图片、链接等
html_elements = content_tag.find_all(["img", "a", "p", "div", "span"])
if len(html_elements) < 3: # 如果HTML元素太少可能是纯文本
print(f"中国网内容HTML元素过少可能是纯文本跳过: {url}")
return
# 移除编辑信息 # 移除编辑信息
for editor_element in content_tag.find_all("div", class_="editor"): for editor_element in content_tag.find_all("div", class_="editor"):
editor_element.decompose() editor_element.decompose()
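
The two quality gates added for 中国网 (minimum text length, minimum markup density) generalize to any site; a sketch of the same checks, thresholds copied from the diff:

    def looks_like_real_article(content_tag, min_chars=100, min_elements=3):
        text = content_tag.get_text(strip=True)
        if len(text) < min_chars:
            return False  # too short: likely a redirect or plain-text stub
        elements = content_tag.find_all(["img", "a", "p", "div", "span"])
        return len(elements) >= min_elements  # too little markup: likely plain text
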
@@ -1355,6 +1618,71 @@ def process_article(url, website):
                     src = value
                     break

+            # 新增:查找新华网特有的视频播放器结构
+            if not src:
+                # 查找包含视频信息的script标签
+                for script in soup.find_all("script"):
+                    if script.string and "video" in script.string.lower():
+                        # 尝试从script中提取视频URL
+                        import re
+                        video_patterns = [
+                            r'https?://[^\s"\']+\.(?:mp4|flv|avi|mov|wmv)',
+                            r'https?://[^\s"\']+video[^\s"\']*',
+                            r'https?://[^\s"\']+media[^\s"\']*'
+                        ]
+                        for pattern in video_patterns:
+                            matches = re.findall(pattern, script.string)
+                            if matches:
+                                src = matches[0]
+                                break
+                        if src:
+                            break
+
+            # 新增:查找新华网特有的iframe视频播放器
+            if not src:
+                iframe = soup.find("iframe", src=lambda x: x and ("video" in x or "player" in x))
+                if iframe:
+                    src = iframe.get("src")
+
+            # 新增:查找新华网特有的视频播放器容器
+            if not src:
+                video_container = soup.find("div", class_="video-container") or soup.find("div", class_="player-container")
+                if video_container:
+                    # 在容器中查找视频元素
+                    video_elem = video_container.find("video")
+                    if video_elem:
+                        src = video_elem.get("src") or video_elem.get("data-src")
+                    # 如果没有找到video标签,查找source标签
+                    if not src:
+                        source_elem = video_container.find("source")
+                        if source_elem:
+                            src = source_elem.get("src") or source_elem.get("data-src")
+
+            # 新增:查找新华网特有的视频链接
+            if not src:
+                video_links = soup.find_all("a", href=lambda x: x and ("video" in x or "media" in x))
+                for link in video_links:
+                    href = link.get("href")
+                    if href and (".mp4" in href or ".flv" in href or "video" in href):
+                        src = href
+                        break
+
+            # 新增:直接从video标签的属性中获取src(处理新华网视频)
+            if not src and video.get("src"):
+                src = video.get("src")
+
+            # 新增:处理新华网视频(从示例代码中提取src)
+            if not src and "新华网" in website.name:
+                # 直接从video标签中获取src属性
+                if video.has_attr('src'):
+                    src = video.get('src')
+                # 检查是否有完整的属性列表
+                for attr in video.attrs:
+                    if isinstance(video.attrs[attr], str) and ('.mp4' in video.attrs[attr] or 'vodpub' in video.attrs[attr]):
+                        src = video.attrs[attr]
+                        break

             if not src:
                 continue
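
The script-scanning branch above carries the core of the new video recovery; a self-contained demonstration of how those regexes behave (the sample script and URL are invented):

    import re

    VIDEO_PATTERNS = [
        r'https?://[^\s"\']+\.(?:mp4|flv|avi|mov|wmv)',
        r'https?://[^\s"\']+video[^\s"\']*',
        r'https?://[^\s"\']+media[^\s"\']*',
    ]

    def extract_video_url(script_text):
        # Return the first match, as the diff's inner loop does.
        for pattern in VIDEO_PATTERNS:
            matches = re.findall(pattern, script_text)
            if matches:
                return matches[0]
        return None

    sample = 'var player = {src: "https://vodpub.example.cn/clip.mp4"};'
    print(extract_video_url(sample))  # https://vodpub.example.cn/clip.mp4
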
@@ -1449,7 +1777,36 @@ def full_site_crawler(start_url, website, max_pages=1000):
print(f"请求失败:{url},错误:{e}") print(f"请求失败:{url},错误:{e}")
continue continue
resp.encoding = 'utf-8' # 针对不同网站设置正确的编码
if "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
resp.encoding = 'utf-8'
elif "中国网" in website.name or "china.com.cn" in website.name:
resp.encoding = 'utf-8'
elif "中国新闻社" in website.name or "chinanews" in website.name:
resp.encoding = 'utf-8'
elif "中国政府网" in website.name or "gov.cn" in website.name:
resp.encoding = 'utf-8'
elif "工人日报" in website.name or "workercn" in website.name:
resp.encoding = 'utf-8'
elif "经济日报" in website.name or "ce.cn" in website.name:
resp.encoding = 'utf-8'
elif "求是" in website.name or "qstheory" in website.name:
resp.encoding = 'utf-8'
elif "旗帜网" in website.name or "qizhiwang" in website.name:
resp.encoding = 'utf-8'
elif "人民日报" in website.name or "people" in website.name:
resp.encoding = 'utf-8'
elif "人民政协网" in website.name or "rmzxw" in website.name:
resp.encoding = 'utf-8'
elif "学习时报" in website.name or "studytimes" in website.name:
resp.encoding = 'utf-8'
elif "中国妇女报" in website.name or "cnwomen" in website.name:
resp.encoding = 'utf-8'
elif "中国青年报" in website.name or "cyol" in website.name:
resp.encoding = 'utf-8'
else:
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, "html.parser") soup = BeautifulSoup(resp.text, "html.parser")
# 根据不同网站判断文章页面 # 根据不同网站判断文章页面