deploy test
@@ -9,7 +9,7 @@ class Command(BaseCommand):

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
-                            choices=['cctv', 'cctvnews', 'all'],
+                            choices=['cctvnews', 'all'],
                             help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), all(全部)')

     def handle(self, *args, **options):
@@ -4,12 +4,12 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"
+    help = "全站递归爬取 中国网主网"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
                             choices=['china', 'all'],
-                            help='选择爬取平台: china(中国网主网), province(中国网一省份), all(全部)')
+                            help='选择爬取平台: china(中国网主网), all(全部)')

     def handle(self, *args, **options):
         platform = options['platform']
@@ -4,11 +4,11 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
-                            choices=['chinadaily', 'mobile', 'all'],
+                            choices=['chinadaily', 'all'],
                             help='选择爬取平台: chinadaily(中国日报), all(全部)')

     def handle(self, *args, **options):
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国新闻社及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国新闻社平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 法治日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 法治日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -5,7 +5,7 @@ from core.utils import full_site_crawler

 # jimmy.fang-20250815: 取消对光明日报的支持,光明日报反爬,被阻挡
 class Command(BaseCommand):
-    help = "全站递归爬取 光明日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 光明日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 工人日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 工人日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 经济日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 经济日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -5,7 +5,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 科技日报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 科技日报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 解放军报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 解放军报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 求是杂志及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 求是杂志平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 旗帜网及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 旗帜网平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 人民政协网及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 人民政协网平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 新华社及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 新华社平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 学习强国中央媒体学习号及省级以上学习平台"
+    help = "全站递归爬取 学习强国平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 学习时报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 学习时报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国妇女报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国妇女报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国纪检监察报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国纪检监察报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
@@ -4,7 +4,7 @@ from core.utils import full_site_crawler


 class Command(BaseCommand):
-    help = "全站递归爬取 中国青年报及其子网站、客户端、新媒体平台"
+    help = "全站递归爬取 中国青年报平台"

     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
core/utils.py (425 changed lines)
@@ -54,6 +54,28 @@ def get_page_with_selenium(url, website_name):
         wait_time = 15  # 学习强国需要更长时间
     elif "法治日报" in website_name:
         wait_time = 12  # 法治日报需要较长时间
+    elif "中国新闻社" in website_name or "chinanews" in website_name:
+        wait_time = 12  # 中国新闻社需要较长时间
+    elif "中国政府网" in website_name or "gov.cn" in website_name:
+        wait_time = 12  # 中国政府网需要较长时间
+    elif "工人日报" in website_name or "workercn" in website_name:
+        wait_time = 12  # 工人日报需要较长时间
+    elif "经济日报" in website_name or "ce.cn" in website_name:
+        wait_time = 12  # 经济日报需要较长时间
+    elif "求是" in website_name or "qstheory" in website_name:
+        wait_time = 12  # 求是网需要较长时间
+    elif "旗帜网" in website_name or "qizhiwang" in website_name:
+        wait_time = 12  # 旗帜网需要较长时间
+    elif "人民日报" in website_name or "people" in website_name:
+        wait_time = 12  # 人民日报需要较长时间
+    elif "人民政协网" in website_name or "rmzxw" in website_name:
+        wait_time = 12  # 人民政协网需要较长时间
+    elif "学习时报" in website_name or "studytimes" in website_name:
+        wait_time = 12  # 学习时报需要较长时间
+    elif "中国妇女报" in website_name or "cnwomen" in website_name:
+        wait_time = 12  # 中国妇女报需要较长时间
+    elif "中国青年报" in website_name or "cyol" in website_name:
+        wait_time = 12  # 中国青年报需要较长时间

     # 等待页面主要内容加载
     try:
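Note: every site added in this hunk waits the same 12 seconds, so the elif chain really encodes a keyword-to-seconds table. A table-driven sketch, not part of this commit (the 10-second default and the function name are assumptions):

# 假设的写法,非本提交内容:用映射表代替elif链
SITE_WAIT_TIMES = {
    "学习强国": 15, "xuexi": 15,
    "法治日报": 12, "chinanews": 12, "gov.cn": 12, "workercn": 12,
    "ce.cn": 12, "qstheory": 12, "qizhiwang": 12, "people": 12,
    "rmzxw": 12, "studytimes": 12, "cnwomen": 12, "cyol": 12,
}

def lookup_wait_time(website_name, default=10):
    # 命中任意关键词即返回对应秒数,否则返回假设的默认值
    for keyword, seconds in SITE_WAIT_TIMES.items():
        if keyword in website_name:
            return seconds
    return default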
@@ -234,10 +256,90 @@ def process_article(url, website):
             "Upgrade-Insecure-Requests": "1",
             "Referer": "http://www.qizhiwang.org.cn/"
         })
+    # 添加中国新闻社的特殊请求头
+    elif "中国新闻社" in website.name or "chinanews" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.chinanews.com.cn/"
+        })
+    # 添加中国政府网的特殊请求头
+    elif "中国政府网" in website.name or "gov.cn" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.gov.cn/"
+        })
+    # 添加经济日报的特殊请求头
+    elif "经济日报" in website.name or "ce.cn" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "http://www.ce.cn/"
+        })
+    # 添加求是网的特殊请求头
+    elif "求是" in website.name or "qstheory" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "http://www.qstheory.cn/"
+        })
+    # 添加人民政协网的特殊请求头
+    elif "人民政协网" in website.name or "rmzxw" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.rmzxw.com.cn/"
+        })
+    # 添加学习时报的特殊请求头
+    elif "学习时报" in website.name or "studytimes" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.studytimes.cn/"
+        })
+    # 添加中国妇女报的特殊请求头
+    elif "中国妇女报" in website.name or "cnwomen" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "https://www.cnwomen.com.cn/"
+        })
+    # 添加中国青年报的特殊请求头
+    elif "中国青年报" in website.name or "cyol" in website.name:
+        headers.update({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Referer": "http://news.cyol.com/"
+        })

     # 判断是否需要使用Selenium
     need_selenium = False
-    if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
+    if any(name in website.name for name in ["中国妇女报", "cnwomen", "中国纪检监察报", "jjjcb", "中国青年报", "cyol"]):
         need_selenium = True

     try:
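Note: the eight header blocks added here differ only in the Referer value; the six shared fields could live in one table. A sketch under that assumption (the helper name is hypothetical, the values are taken from this hunk):

# 假设的整理写法,非本提交内容
COMMON_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
SITE_REFERERS = {
    "chinanews": "https://www.chinanews.com.cn/",
    "gov.cn": "https://www.gov.cn/",
    "ce.cn": "http://www.ce.cn/",
    "qstheory": "http://www.qstheory.cn/",
    "rmzxw": "https://www.rmzxw.com.cn/",
    "studytimes": "https://www.studytimes.cn/",
    "cnwomen": "https://www.cnwomen.com.cn/",
    "cyol": "http://news.cyol.com/",
}

def build_headers(website_name):
    # 复制公共头,再按站点关键词补上对应的Referer
    headers = dict(COMMON_HEADERS)
    for keyword, referer in SITE_REFERERS.items():
        if keyword in website_name:
            headers["Referer"] = referer
            break
    return headers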
@@ -249,7 +351,7 @@ def process_article(url, website):
         return

     # 检查页面内容是否过短
-    min_length = 100 if "法治日报" in website.name else 300
+    min_length = 200
     if len(page_source) < min_length:
         print(f"页面内容过短,可能是重定向页面:{url}")
         return
@@ -266,6 +368,40 @@ def process_article(url, website):
         print(f"页面内容过短,可能是重定向页面:{url}")
         return

+    # 针对不同网站设置正确的编码
+    if "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
+        resp.encoding = 'utf-8'
+    elif "中国网" in website.name or "china.com.cn" in website.name:
+        resp.encoding = 'utf-8'
+    elif "中国新闻社" in website.name or "chinanews" in website.name:
+        resp.encoding = 'utf-8'
+    elif "中国政府网" in website.name or "gov.cn" in website.name:
+        resp.encoding = 'utf-8'
+    elif "工人日报" in website.name or "workercn" in website.name:
+        resp.encoding = 'utf-8'
+    elif "经济日报" in website.name or "ce.cn" in website.name:
+        resp.encoding = 'utf-8'
+    elif "求是" in website.name or "qstheory" in website.name:
+        resp.encoding = 'utf-8'
+    elif "旗帜网" in website.name or "qizhiwang" in website.name:
+        resp.encoding = 'utf-8'
+    elif "人民日报" in website.name or "people" in website.name:
+        resp.encoding = 'utf-8'
+    elif "人民政协网" in website.name or "rmzxw" in website.name:
+        resp.encoding = 'utf-8'
+    elif "学习时报" in website.name or "studytimes" in website.name:
+        resp.encoding = 'utf-8'
+    elif "中国妇女报" in website.name or "cnwomen" in website.name:
+        resp.encoding = 'utf-8'
+    elif "中国青年报" in website.name or "cyol" in website.name:
+        resp.encoding = 'utf-8'
+    elif "学习强国" in website.name or "xuexi" in website.name:
+        resp.encoding = 'utf-8'
+    elif "法治日报" in website.name or "legaldaily" in website.name:
+        resp.encoding = 'utf-8'
+    else:
+        resp.encoding = 'utf-8'

     # 创建BeautifulSoup对象
     soup = BeautifulSoup(resp.text, "html.parser")
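Note: every branch of the new chain assigns the same value, so it reduces to a single unconditional `resp.encoding = 'utf-8'`. If a non-UTF-8 site ever appears, requests' own detection could serve as a fallback. A minimal sketch, not part of this commit (the wrapper name and timeout are assumptions):

import requests

def fetch_utf8(url):
    # 假设性的封装:统一按UTF-8解码,必要时回退到requests的编码探测
    resp = requests.get(url, timeout=10)
    resp.encoding = 'utf-8'
    # 解码失败的字节会变成替换字符U+FFFD,此时改用探测到的编码
    if '\ufffd' in resp.text:
        resp.encoding = resp.apparent_encoding
    return resp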
@@ -274,29 +410,32 @@ def process_article(url, website):
         return

-    # 针对不同网站设置正确的编码(仅对requests获取的内容)
-    if not need_selenium:
-        if "人民网" in website.name or "人民日报" in website.name:
-            resp.encoding = 'utf-8'
-        elif "新华网" in website.name:
-            resp.encoding = 'utf-8'
-        elif "央视" in website.name or "CCTV" in website.name:
-            resp.encoding = 'utf-8'
-        elif "农民日报" in website.name or "farmer" in website.name:
-            resp.encoding = 'utf-8'
-            # 尝试其他编码
-            if '锘' in resp.text or len(resp.text) < 1000:
-                resp.encoding = 'gbk'
-                # 进一步尝试其他编码
-                if '锘' in resp.text or '锘' in resp.text:
-                    resp.encoding = 'gb2312'
-                    # 如果还是有问题,尝试更多编码
-                    if '锘' in resp.text or '锘' in resp.text:
-                        resp.encoding = 'utf-8-sig'
-                        # 最后尝试
-                        if '锘' in resp.text or '锘' in resp.text:
-                            resp.encoding = 'big5'
-        else:
-            resp.encoding = 'utf-8'
+    # 注释掉原有的编码处理逻辑,统一使用UTF-8
+    # if not need_selenium:
+    #     if "人民网" in website.name or "人民日报" in website.name:
+    #         resp.encoding = 'utf-8'
+    #     elif "新华网" in website.name:
+    #         resp.encoding = 'utf-8'
+    #     elif "农民日报" in website.name or "farmer" in website.name:
+    #         resp.encoding = 'utf-8'
+    #         # 尝试其他编码
+    #         if '锘' in resp.text or len(resp.text) < 1000:
+    #             resp.encoding = 'gbk'
+    #             # 进一步尝试其他编码
+    #             if '锘' in resp.text or '锘' in resp.text:
+    #                 resp.encoding = 'gb2312'
+    #                 # 如果还是有问题,尝试更多编码
+    #                 if '锘' in resp.text or '锘' in resp.text:
+    #                     resp.encoding = 'utf-8-sig'
+    #                 # 最后尝试
+    #                 if '锘' in resp.text or '锘' in resp.text:
+    #                     resp.encoding = 'big5'
+    #     else:
+    #         resp.encoding = 'utf-8'
+
+    # 统一设置编码为UTF-8,解决乱码问题
+    #if not need_selenium:
+    #    resp.encoding = 'utf-8'

     # 处理不同网站的文章结构
     if website.name == "新华网" or website.name == "www.news.cn" or "新华网" in website.name:
@@ -363,6 +502,16 @@ def process_article(url, website):
                 if title_text in title_element_text or title_element_text in title_text:
                     title_element.decompose()

+            # 移除id为ti的元素(中国政府网特有的标题元素)
+            for ti_element in content_tag.find_all(id="ti"):
+                ti_element.decompose()
+
+            # 移除包含"简历"等关键词的重复标题
+            for element in content_tag.find_all(["h1", "h2", "h3", "strong", "b"]):
+                element_text = element.get_text(strip=True)
+                if "简历" in element_text and len(element_text) < 20:
+                    element.decompose()
+
             # 移除编辑信息
             for editor_element in content_tag.find_all("div", class_="editor"):
                 editor_element.decompose()
@@ -394,7 +543,19 @@ def process_article(url, website):
             soup.find("section", class_="content") or
             soup.find("div", class_="article") or
             soup.find("div", class_="rm_txt_con") or  # 添加人民网特有的内容容器
-            soup.find("div", class_="text_c")  # 添加新的内容容器
+            soup.find("div", class_="text_c") or  # 添加新的内容容器
+            soup.find("div", class_="article-detail") or  # 人民日报文章详情容器
+            soup.find("div", class_="detail-content") or  # 人民日报详情内容容器
+            soup.find("div", class_="article-text") or  # 人民日报文章文本容器
+            soup.find("div", class_="content-text") or  # 人民日报内容文本容器
+            soup.find("div", class_="news-content") or  # 人民日报新闻内容容器
+            soup.find("div", class_="news-text") or  # 人民日报新闻文本容器
+            soup.find("div", class_="news-detail") or  # 人民日报新闻详情容器
+            soup.find("div", class_="article-main") or  # 人民日报文章主体容器
+            soup.find("div", class_="article-container") or  # 人民日报文章容器
+            soup.find("div", class_="content-container") or  # 人民日报内容容器
+            soup.find("div", class_="text-container") or  # 人民日报文本容器
+            soup.find("div", class_="main-container")  # 人民日报主体容器
         )

         # 针对人民网的特殊处理,清理内容中的无关元素
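Note: this hunk (and the near-identical ones below for 央视网, 中国青年报, 中国妇女报, 农民日报) grows the or-chain of soup.find calls one class name at a time; a candidate list expresses the same priority order in one place. A sketch, not part of this commit, using the class names from this hunk:

from bs4 import BeautifulSoup

# 按优先级排列的候选class(取自本hunk)
CANDIDATE_CLASSES = [
    "rm_txt_con", "text_c", "article-detail", "detail-content",
    "article-text", "content-text", "news-content", "news-text",
    "news-detail", "article-main", "article-container",
    "content-container", "text-container", "main-container",
]

def find_content_tag(soup, candidates=CANDIDATE_CLASSES):
    # 返回第一个命中的<div>,与原来的or链等价
    for cls in candidates:
        tag = soup.find("div", class_=cls)
        if tag is not None:
            return tag
    return None

# 用法示例
# tag = find_content_tag(BeautifulSoup(html, "html.parser"))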
@@ -454,7 +615,19 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="article-detail") or  # 央视网文章详情容器
+            soup.find("div", class_="detail-content") or  # 央视网详情内容容器
+            soup.find("div", class_="article-text") or  # 央视网文章文本容器
+            soup.find("div", class_="content-text") or  # 央视网内容文本容器
+            soup.find("div", class_="news-content") or  # 央视网新闻内容容器
+            soup.find("div", class_="news-text") or  # 央视网新闻文本容器
+            soup.find("div", class_="news-detail") or  # 央视网新闻详情容器
+            soup.find("div", class_="article-main") or  # 央视网文章主体容器
+            soup.find("div", class_="article-container") or  # 央视网文章容器
+            soup.find("div", class_="content-container") or  # 央视网内容容器
+            soup.find("div", class_="text-container") or  # 央视网文本容器
+            soup.find("div", class_="main-container")  # 央视网主体容器
         )

         # 针对央视网的特殊处理,清理内容中的无关元素
@@ -503,6 +676,17 @@ def process_article(url, website):
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

+        # 针对求是的特殊处理,如果标题包含"海报"等关键词,尝试从内容中提取更好的标题
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if "海报" in title_text or "图" in title_text:
+                # 尝试从内容中查找更好的标题
+                content_h1 = soup.find("h1")
+                if content_h1 and content_h1 != title_tag:
+                    content_title = content_h1.get_text(strip=True)
+                    if len(content_title) > len(title_text) and "海报" not in content_title:
+                        title_tag = content_h1
+
         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -819,14 +1003,41 @@ def process_article(url, website):
         # 特殊处理人民政协网的标题结构
         if title_tag and title_tag.find("span", id="a"):
             title_tag = title_tag.find("span", id="a")
-        elif title_tag and title_tag.get_text(strip=True) == "首页>聚焦":
+        elif title_tag and (title_tag.get_text(strip=True) == "首页>聚焦" or title_tag.get_text(strip=True) == "首页 > 聚焦"):
             # 如果标题还是"首页>聚焦",尝试从内容中提取标题
-            if content_tag:
-                first_p = content_tag.find("p")
+            # 查找文章正文中的第一个strong标签作为标题
+            content_div = soup.find("div", class_="text_box")
+            if content_div:
+                first_p = content_div.find("p")
                 if first_p and first_p.find("strong"):
                     title_text = first_p.find("strong").get_text().strip()
                     # 创建一个虚拟的title_tag对象
                     title_tag = first_p.find("strong")
+                else:
+                    # 如果没有找到strong标签,尝试查找内容中的第一个h2标签
+                    first_h2 = content_div.find("h2")
+                    if first_h2:
+                        title_tag = first_h2

+        # 针对人民政协网的特殊处理,如果标题包含"首页>聚焦",尝试从页面中查找更好的标题
+        if title_tag and ("首页>聚焦" in title_tag.get_text(strip=True) or "首页 > 聚焦" in title_tag.get_text(strip=True)):
+            # 尝试从页面中查找其他可能的标题
+            for h in soup.find_all(["h1", "h2", "h3"]):
+                h_text = h.get_text(strip=True)
+                if h_text and "首页>聚焦" not in h_text and "首页 > 聚焦" not in h_text and len(h_text) > 5:
+                    title_tag = h
+                    break
+
+            # 如果还是没找到,尝试从title标签提取
+            if "首页>聚焦" in title_tag.get_text(strip=True) or "首页 > 聚焦" in title_tag.get_text(strip=True):
+                page_title = soup.find("title")
+                if page_title:
+                    title_text = page_title.get_text(strip=True)
+                    # 移除网站名称等后缀信息
+                    if " - 人民政协网" in title_text:
+                        title_text = title_text.split(" - 人民政协网")[0]
+                    if "首页>聚焦" not in title_text and "首页 > 聚焦" not in title_text and len(title_text) > 5:
+                        title_tag = page_title

         content_tag = (
             soup.find("div", class_="content") or
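Note: the 人民政协网 title fallbacks now span three passes (strong/h2 in the body, other headings, the page title). Collected into one helper, the order of preference is easier to read. A sketch mirroring this hunk's logic (the helper name is hypothetical):

# 假设的提炼写法,非本提交内容:按顺序尝试多个标题来源
def pick_rmzxw_title(soup, title_tag):
    bad = ("首页>聚焦", "首页 > 聚焦")
    def is_bad(text):
        return any(b in text for b in bad) or len(text) <= 5
    if title_tag and not is_bad(title_tag.get_text(strip=True)):
        return title_tag
    for h in soup.find_all(["h1", "h2", "h3"]):
        if not is_bad(h.get_text(strip=True)):
            return h
    page_title = soup.find("title")
    if page_title:
        text = page_title.get_text(strip=True).split(" - 人民政协网")[0]
        if not is_bad(text):
            return page_title
    return title_tag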
@@ -972,7 +1183,14 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="article-detail") or  # 中国青年报文章详情容器
+            soup.find("div", class_="detail-content") or  # 中国青年报详情内容容器
+            soup.find("div", class_="article-text") or  # 中国青年报文章文本容器
+            soup.find("div", class_="content-text") or  # 中国青年报内容文本容器
+            soup.find("div", class_="news-content") or  # 中国青年报新闻内容容器
+            soup.find("div", class_="news-text") or  # 中国青年报新闻文本容器
+            soup.find("div", class_="news-detail")  # 中国青年报新闻详情容器
         )
     elif "中国妇女报" in website.name or "cnwomen" in website.name:
         # 中国妇女报的文章结构处理 - 修复不保存文章内容问题
@@ -987,6 +1205,9 @@ def process_article(url, website):
         title_tag = soup.find("title")

         content_tag = (
+            soup.find("div", class_="f_container") or  # 中国妇女报特有内容容器
+            soup.find("div", class_="f_container_left") or  # 中国妇女报特有内容容器
+            soup.find("div", class_="f_navigation_bars") or  # 中国妇女报特有内容容器
             soup.find("div", class_="main") or  # 中国妇女报特有内容容器
             soup.find("div", class_="news") or  # 中国妇女报特有内容容器
             soup.find("div", class_="content") or
@@ -995,7 +1216,19 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="article-detail") or  # 中国妇女报文章详情容器
+            soup.find("div", class_="detail-content") or  # 中国妇女报详情内容容器
+            soup.find("div", class_="article-text") or  # 中国妇女报文章文本容器
+            soup.find("div", class_="content-text") or  # 中国妇女报内容文本容器
+            soup.find("div", class_="news-content") or  # 中国妇女报新闻内容容器
+            soup.find("div", class_="news-text") or  # 中国妇女报新闻文本容器
+            soup.find("div", class_="news-detail") or  # 中国妇女报新闻详情容器
+            soup.find("div", class_="article-main") or  # 中国妇女报文章主体容器
+            soup.find("div", class_="article-container") or  # 中国妇女报文章容器
+            soup.find("div", class_="content-container") or  # 中国妇女报内容容器
+            soup.find("div", class_="text-container") or  # 中国妇女报文本容器
+            soup.find("div", class_="main-container")  # 中国妇女报主体容器
         )
     elif "法治日报" in website.name or "legaldaily" in website.name:
         # 法治日报的文章结构处理 - 修复不保存正文和图片问题
@@ -1080,7 +1313,19 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="article-detail") or  # 农民日报文章详情容器
+            soup.find("div", class_="detail-content") or  # 农民日报详情内容容器
+            soup.find("div", class_="article-text") or  # 农民日报文章文本容器
+            soup.find("div", class_="content-text") or  # 农民日报内容文本容器
+            soup.find("div", class_="news-content") or  # 农民日报新闻内容容器
+            soup.find("div", class_="news-text") or  # 农民日报新闻文本容器
+            soup.find("div", class_="news-detail") or  # 农民日报新闻详情容器
+            soup.find("div", class_="article-main") or  # 农民日报文章主体容器
+            soup.find("div", class_="article-container") or  # 农民日报文章容器
+            soup.find("div", class_="content-container") or  # 农民日报内容容器
+            soup.find("div", class_="text-container") or  # 农民日报文本容器
+            soup.find("div", class_="main-container")  # 农民日报主体容器
         )

         # 针对农民日报的特殊处理,如果找到多个detailCon,选择内容最长的那个
@@ -1253,11 +1498,29 @@ def process_article(url, website):
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="news-content") or  # 中国网新闻内容容器
+            soup.find("div", class_="news-text") or  # 中国网新闻文本容器
+            soup.find("div", class_="news-detail") or  # 中国网新闻详情容器
+            soup.find("div", class_="detail-content") or  # 中国网详情内容容器
+            soup.find("div", class_="article-text") or  # 中国网文章文本容器
+            soup.find("div", class_="content-text")  # 中国网内容文本容器
         )

         # 针对中国网的特殊处理,清理内容中的无关元素
         if content_tag:
+            # 检查内容质量,过滤掉纯文本内容
+            content_text = content_tag.get_text(strip=True)
+            if len(content_text) < 100:  # 如果内容太短,可能是纯文本
+                print(f"中国网内容过短,可能是纯文本,跳过: {url}")
+                return
+
+            # 检查是否包含足够的HTML标签(图片、链接等)
+            html_elements = content_tag.find_all(["img", "a", "p", "div", "span"])
+            if len(html_elements) < 3:  # 如果HTML元素太少,可能是纯文本
+                print(f"中国网内容HTML元素过少,可能是纯文本,跳过: {url}")
+                return
+
             # 移除编辑信息
             for editor_element in content_tag.find_all("div", class_="editor"):
                 editor_element.decompose()
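Note: the length and element-count checks added for 中国网 form a reusable quality gate. A sketch with the thresholds from this hunk (the helper name is hypothetical, not part of this commit):

# 假设的提炼写法,非本提交内容:正文质量检查
def looks_like_article(content_tag, min_chars=100, min_elements=3):
    if content_tag is None:
        return False
    if len(content_tag.get_text(strip=True)) < min_chars:
        return False  # 文字太少,可能是纯文本或重定向页
    if len(content_tag.find_all(["img", "a", "p", "div", "span"])) < min_elements:
        return False  # HTML元素太少,可能不是正文容器
    return True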
@@ -1355,6 +1618,71 @@ def process_article(url, website):
                     src = value
                     break

+        # 新增:查找新华网特有的视频播放器结构
+        if not src:
+            # 查找包含视频信息的script标签
+            for script in soup.find_all("script"):
+                if script.string and "video" in script.string.lower():
+                    # 尝试从script中提取视频URL
+                    import re
+                    video_patterns = [
+                        r'https?://[^\s"\']+\.(?:mp4|flv|avi|mov|wmv)',
+                        r'https?://[^\s"\']+video[^\s"\']*',
+                        r'https?://[^\s"\']+media[^\s"\']*'
+                    ]
+                    for pattern in video_patterns:
+                        matches = re.findall(pattern, script.string)
+                        if matches:
+                            src = matches[0]
+                            break
+                    if src:
+                        break
+
+        # 新增:查找新华网特有的iframe视频播放器
+        if not src:
+            iframe = soup.find("iframe", src=lambda x: x and ("video" in x or "player" in x))
+            if iframe:
+                src = iframe.get("src")
+
+        # 新增:查找新华网特有的视频播放器容器
+        if not src:
+            video_container = soup.find("div", class_="video-container") or soup.find("div", class_="player-container")
+            if video_container:
+                # 在容器中查找视频元素
+                video_elem = video_container.find("video")
+                if video_elem:
+                    src = video_elem.get("src") or video_elem.get("data-src")
+
+                # 如果没有找到video标签,查找source标签
+                if not src:
+                    source_elem = video_container.find("source")
+                    if source_elem:
+                        src = source_elem.get("src") or source_elem.get("data-src")
+
+        # 新增:查找新华网特有的视频链接
+        if not src:
+            video_links = soup.find_all("a", href=lambda x: x and ("video" in x or "media" in x))
+            for link in video_links:
+                href = link.get("href")
+                if href and (".mp4" in href or ".flv" in href or "video" in href):
+                    src = href
+                    break
+
+        # 新增:直接从video标签的属性中获取src(处理新华网视频)
+        if not src and video.get("src"):
+            src = video.get("src")
+
+        # 新增:处理新华网视频,从示例代码中提取src
+        if not src and "新华网" in website.name:
+            # 直接从video标签中获取src属性
+            if video.has_attr('src'):
+                src = video.get('src')
+            # 检查是否有完整的属性列表
+            for attr in video.attrs:
+                if isinstance(video.attrs[attr], str) and ('.mp4' in video.attrs[attr] or 'vodpub' in video.attrs[attr]):
+                    src = video.attrs[attr]
+                    break

         if not src:
             continue
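Note: the script-tag fallback above relies on three URL regexes. A quick standalone check of what they match; the input string is a made-up example (the 'vodpub' host fragment is borrowed from the attribute scan in this hunk):

import re

VIDEO_PATTERNS = [
    r'https?://[^\s"\']+\.(?:mp4|flv|avi|mov|wmv)',
    r'https?://[^\s"\']+video[^\s"\']*',
    r'https?://[^\s"\']+media[^\s"\']*',
]

script_text = 'var player = {url: "https://vodpub1.v.news.cn/demo/video123.mp4"};'
for pattern in VIDEO_PATTERNS:
    matches = re.findall(pattern, script_text)
    if matches:
        print(matches[0])  # 输出 https://vodpub1.v.news.cn/demo/video123.mp4
        break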
@@ -1449,7 +1777,36 @@ def full_site_crawler(start_url, website, max_pages=1000):
            print(f"请求失败:{url},错误:{e}")
            continue

+        # 针对不同网站设置正确的编码
+        if "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
+            resp.encoding = 'utf-8'
+        elif "中国网" in website.name or "china.com.cn" in website.name:
+            resp.encoding = 'utf-8'
+        elif "中国新闻社" in website.name or "chinanews" in website.name:
+            resp.encoding = 'utf-8'
+        elif "中国政府网" in website.name or "gov.cn" in website.name:
+            resp.encoding = 'utf-8'
+        elif "工人日报" in website.name or "workercn" in website.name:
+            resp.encoding = 'utf-8'
+        elif "经济日报" in website.name or "ce.cn" in website.name:
+            resp.encoding = 'utf-8'
+        elif "求是" in website.name or "qstheory" in website.name:
+            resp.encoding = 'utf-8'
+        elif "旗帜网" in website.name or "qizhiwang" in website.name:
+            resp.encoding = 'utf-8'
+        elif "人民日报" in website.name or "people" in website.name:
+            resp.encoding = 'utf-8'
+        elif "人民政协网" in website.name or "rmzxw" in website.name:
+            resp.encoding = 'utf-8'
+        elif "学习时报" in website.name or "studytimes" in website.name:
+            resp.encoding = 'utf-8'
+        elif "中国妇女报" in website.name or "cnwomen" in website.name:
+            resp.encoding = 'utf-8'
+        elif "中国青年报" in website.name or "cyol" in website.name:
+            resp.encoding = 'utf-8'
+        else:
+            resp.encoding = 'utf-8'

        soup = BeautifulSoup(resp.text, "html.parser")

        # 根据不同网站判断文章页面