fix all bugs
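Add Selenium-based page fetching for sites that render content dynamically (学习强国, 法治日报), run the per-site encoding fallbacks only for pages fetched with requests, and extend the title and content-container fallbacks for 中国纪检监察报, 中国青年报, 中国妇女报, 法治日报, 农民日报, 学习强国, 旗帜网, and 中国网.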
core/utils.py (238 changed lines)
@@ -7,8 +7,77 @@ from django.utils import timezone
 from django.conf import settings
 from core.models import Article
 import re
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
+
+
+def get_selenium_driver():
+    """Get a Selenium WebDriver instance."""
+    try:
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")  # headless mode
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument("--window-size=1920,1080")
+        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+
+        service = Service(ChromeDriverManager().install())
+        driver = webdriver.Chrome(service=service, options=chrome_options)
+        return driver
+    except Exception as e:
+        print(f"Failed to create Selenium WebDriver: {e}")
+        return None
+
+
+def get_page_with_selenium(url, website_name):
+    """Fetch dynamically loaded page content with Selenium."""
+    driver = None
+    try:
+        driver = get_selenium_driver()
+        if not driver:
+            return None
+
+        print(f"Loading page with Selenium: {url}")
+        driver.get(url)
+
+        # Wait for the page to finish loading
+        wait_time = 10
+        if "学习强国" in website_name:
+            wait_time = 15  # 学习强国 needs more time
+        elif "法治日报" in website_name:
+            wait_time = 12  # 法治日报 needs a fairly long wait
+
+        # Wait for the page's main content to load
+        try:
+            WebDriverWait(driver, wait_time).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+        except Exception:
+            print(f"Timed out waiting for page load: {url}")
+
+        # Extra wait to make sure dynamic content has finished loading
+        time.sleep(3)
+
+        # Grab the page source
+        page_source = driver.page_source
+        return page_source
+
+    except Exception as e:
+        print(f"Selenium failed to fetch page: {url}, error: {e}")
+        return None
+    finally:
+        if driver:
+            try:
+                driver.quit()
+            except Exception:
+                pass
+
+
 def download_media(url, save_dir):
     try:
         # Add request headers to avoid 403 Forbidden errors
@@ -163,37 +232,68 @@ def process_article(url, website):
         "Referer": "http://www.qizhiwang.org.cn/"
     })

-    try:
-        resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
-        resp.raise_for_status()
-
-        # Check whether this is a redirect page
-        if len(resp.text) < 500:
-            print(f"Page content too short, probably a redirect page: {url}")
-            return
+    # Decide whether Selenium is needed
+    need_selenium = False
+    if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
+        need_selenium = True
+
+    try:
+        if need_selenium:
+            # Use Selenium to fetch dynamically loaded content
+            page_source = get_page_with_selenium(url, website.name)
+            if not page_source:
+                print(f"Selenium failed to fetch page: {url}")
+                return
+
+            # Check whether the page content is too short
+            min_length = 100 if "法治日报" in website.name else 300
+            if len(page_source) < min_length:
+                print(f"Page content too short, probably a redirect page: {url}")
+                return
+
+            # Create the BeautifulSoup object
+            soup = BeautifulSoup(page_source, "html.parser")
+        else:
+            # Use requests to fetch static content
+            resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
+            resp.raise_for_status()
+
+            # Check whether this is a redirect page
+            if len(resp.text) < 300:
+                print(f"Page content too short, probably a redirect page: {url}")
+                return
+
+            # Create the BeautifulSoup object
+            soup = BeautifulSoup(resp.text, "html.parser")
     except Exception as e:
         print(f"Request failed: {url}, error: {e}")
         return

-    # Set the correct encoding per website
-    if "人民网" in website.name or "人民日报" in website.name:
-        resp.encoding = 'utf-8'
-    elif "新华网" in website.name:
-        resp.encoding = 'utf-8'
-    elif "央视" in website.name or "CCTV" in website.name:
-        resp.encoding = 'utf-8'
-    elif "农民日报" in website.name or "farmer" in website.name:
-        resp.encoding = 'utf-8'
-        # Try another encoding
-        if '�' in resp.text or len(resp.text) < 1000:
-            resp.encoding = 'gbk'
-            # Try yet another encoding
-            if '�' in resp.text or '�' in resp.text:
-                resp.encoding = 'gb2312'
-    else:
-        resp.encoding = 'utf-8'
-
-    soup = BeautifulSoup(resp.text, "html.parser")
+    # Set the correct encoding per website (only for content fetched with requests)
+    if not need_selenium:
+        if "人民网" in website.name or "人民日报" in website.name:
+            resp.encoding = 'utf-8'
+        elif "新华网" in website.name:
+            resp.encoding = 'utf-8'
+        elif "央视" in website.name or "CCTV" in website.name:
+            resp.encoding = 'utf-8'
+        elif "农民日报" in website.name or "farmer" in website.name:
+            resp.encoding = 'utf-8'
+            # Try another encoding
+            if '�' in resp.text or len(resp.text) < 1000:
+                resp.encoding = 'gbk'
+                # Try yet another encoding
+                if '�' in resp.text or '�' in resp.text:
+                    resp.encoding = 'gb2312'
+                    # If it still looks wrong, try more encodings
+                    if '�' in resp.text or '�' in resp.text:
+                        resp.encoding = 'utf-8-sig'
+                        # Last attempt
+                        if '�' in resp.text or '�' in resp.text:
+                            resp.encoding = 'big5'
+        else:
+            resp.encoding = 'utf-8'

     # Handle the article structure per website
     if website.name == "新华网" or website.name == "www.news.cn" or "新华网" in website.name:
@@ -777,6 +877,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 中国纪检监察报: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -784,7 +889,10 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="main") or
+            soup.find("div", class_="detail") or
+            soup.find("div", class_="article_yt")  # Content container specific to 中国纪检监察报
         )
     elif "中国新闻社" in website.name or "chinanews" in website.name:
         # Article structure handling for 中国新闻社 - fixes scraping of non-article sections
@@ -847,7 +955,13 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 中国青年报: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
+            soup.find("div", class_="main") or  # Content container specific to 中国青年报
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
@@ -863,7 +977,14 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 中国妇女报: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
+            soup.find("div", class_="main") or  # Content container specific to 中国妇女报
+            soup.find("div", class_="news") or  # Content container specific to 中国妇女报
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
@@ -880,6 +1001,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 法治日报: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
             soup.find("div", class_="content-two") or  # Prefer the content-two class
             soup.find("div", class_="article-content") or  # Content container specific to 法治日报
@@ -888,7 +1014,10 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="article-detail") or  # Content container specific to 法治日报
+            soup.find("div", class_="detail-content") or  # Content container specific to 法治日报
+            soup.find("div", class_="article-text")  # Content container specific to 法治日报
         )

         # Special handling for 法治日报: strip irrelevant elements from the content
@@ -935,6 +1064,10 @@ def process_article(url, website):
             title_text = title_tag.get_text(strip=True)
             if title_text and any(char in title_text for char in ['�', '�', '�']):
                 title_tag = soup.find("title")
+
+        # Special handling for 农民日报: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
         content_tag = (
             soup.find("div", class_="detailCon") or  # Content container specific to 农民日报
             soup.find("div", class_="article-content") or
@@ -983,6 +1116,17 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 学习强国: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
+        # Special handling for 学习强国: if the title is too short, fall back to the title tag
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if title_text and len(title_text) < 10:
+                title_tag = soup.find("title")
+
         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -990,7 +1134,14 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="main") or
+            soup.find("div", class_="detail") or
+            soup.find("div", class_="lgpage-detail") or  # Content container specific to 学习强国
+            soup.find("div", class_="detail-content") or  # Content container specific to 学习强国
+            soup.find("div", class_="article-detail") or  # Content container specific to 学习强国
+            soup.find("div", class_="xuexi") or  # Content container specific to 学习强国
+            soup.find("div", class_="kNews")  # Content container specific to 学习强国
         )
     elif "旗帜网" in website.name or "qizhiwang" in website.name:
         # Article structure handling for 旗帜网 - fixes body text and images not being saved
@@ -1002,6 +1153,22 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 旗帜网: if the title is empty or too short, fall back to the title tag
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if not title_text or len(title_text) < 5:
+                title_tag = soup.find("title")
+
+        # Special handling for 旗帜网: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
+        # Special handling for 旗帜网: if the title is too short, fall back to the title tag
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if title_text and len(title_text) < 10:
+                title_tag = soup.find("title")
+
         content_tag = (
             soup.find("div", class_="w1200 flag-text-con clearfix") or  # Content container specific to 旗帜网
             soup.find("div", class_="article-content") or  # Content container specific to 旗帜网
@@ -1065,14 +1232,23 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 中国网: if the title is empty or too short, fall back to the title tag
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if not title_text or len(title_text) < 5:
+                title_tag = soup.find("title")
+
         content_tag = (
-            soup.find("div", class_="main") or  # Content container specific to 中国网
+            soup.find("div", class_="article") or  # Content container specific to 中国网
+            soup.find("div", class_="main") or
+            soup.find("div", class_="textBox") or  # Content container specific to 中国网 live-broadcast pages
+            soup.find("div", class_="artInfo") or  # Content container specific to 中国网 live-broadcast pages
             soup.find("div", class_="article-content") or
             soup.find("div", class_="content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
-            soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
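A minimal usage sketch of the new helper (assuming core.utils exports get_page_with_selenium as added above; the URL is a hypothetical placeholder):

    from bs4 import BeautifulSoup

    from core.utils import get_page_with_selenium

    # Hypothetical article URL; any page from one of the Selenium-gated sites works
    page_source = get_page_with_selenium("https://example.com/article.html", "学习强国")
    if page_source:
        soup = BeautifulSoup(page_source, "html.parser")
        title_tag = soup.find("h1") or soup.find("title")
        print(title_tag.get_text(strip=True) if title_tag else "no title found")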