fix bugs and support all platforms
core/utils.py (106 changes)
@@ -26,8 +26,9 @@ def get_selenium_driver():
         chrome_options.add_argument("--disable-dev-shm-usage")
         chrome_options.add_argument("--disable-gpu")
         chrome_options.add_argument("--window-size=1920,1080")
-        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+        chrome_options.add_argument(
+            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

         service = Service(ChromeDriverManager().install())
         driver = webdriver.Chrome(service=service, options=chrome_options)
         return driver
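For orientation, the options in this hunk belong to a headless-Chrome factory whose remaining lines are outside the diff. A minimal sketch of such a function, assuming the usual headless and no-sandbox flags and the try/except the next hunk implies:

    # Sketch of a driver factory consistent with the hunk above. The headless
    # and no-sandbox flags are assumptions; they do not appear in this diff.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    def make_driver():
        chrome_options = Options()
        chrome_options.add_argument("--headless")    # assumed
        chrome_options.add_argument("--no-sandbox")  # assumed
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
        try:
            service = Service(ChromeDriverManager().install())
            return webdriver.Chrome(service=service, options=chrome_options)
        except Exception as e:
            print(f"Failed to create Selenium WebDriver: {e}")
            return None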
@@ -35,6 +36,7 @@ def get_selenium_driver():
         print(f"Failed to create Selenium WebDriver: {e}")
         return None

+
 def get_page_with_selenium(url, website_name):
     """Fetch dynamically loaded page content with Selenium."""
     driver = None
@@ -42,17 +44,17 @@ def get_page_with_selenium(url, website_name):
         driver = get_selenium_driver()
         if not driver:
             return None

         print(f"Loading page with Selenium: {url}")
         driver.get(url)

         # Wait for the page to finish loading
         wait_time = 10
         if "学习强国" in website_name:
             wait_time = 15  # 学习强国 needs a longer wait
         elif "法治日报" in website_name:
             wait_time = 12  # 法治日报 needs a fairly long wait

         # Wait for the page's main content to load
         try:
             WebDriverWait(driver, wait_time).until(
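The condition passed to until() is cut off by the hunk boundary. A typical condition for this kind of readiness wait, with the locator chosen here purely as an illustration:

    # Hypothetical wait condition; the real locator is outside the hunk above.
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    WebDriverWait(driver, wait_time).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )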
@@ -60,14 +62,14 @@ def get_page_with_selenium(url, website_name):
             )
         except:
             print(f"Timed out waiting for page load: {url}")

         # Extra wait to make sure dynamic content has finished loading
         time.sleep(3)

         # Get the page source
         page_source = driver.page_source
         return page_source

     except Exception as e:
         print(f"Selenium failed to fetch page: {url}, error: {e}")
         return None
@@ -78,6 +80,7 @@ def get_page_with_selenium(url, website_name):
         except:
             pass

+
 def download_media(url, save_dir):
     try:
         # Add request headers to avoid 403 Forbidden errors
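Most of download_media sits outside this hunk. A minimal sketch of a download that sends browser-like headers to dodge 403 responses; the header values and the filename logic are assumptions:

    # Sketch only; everything beyond "send headers to avoid 403" is assumed.
    import os
    import requests

    def download_media_sketch(url, save_dir):
        headers = {"User-Agent": "Mozilla/5.0", "Referer": url}  # assumed values
        try:
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()
            os.makedirs(save_dir, exist_ok=True)
            filename = os.path.basename(url.split("?")[0]) or "media.bin"
            path = os.path.join(save_dir, filename)
            with open(path, "wb") as f:
                f.write(resp.content)
            return path
        except Exception as e:
            print(f"Download failed: {url}, error: {e}")
            return None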
@@ -236,7 +239,7 @@ def process_article(url, website):
     need_selenium = False
     if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
         need_selenium = True

     try:
         if need_selenium:
             # Use Selenium to fetch dynamically loaded content
@@ -244,28 +247,28 @@ def process_article(url, website):
             if not page_source:
                 print(f"Selenium failed to fetch page: {url}")
                 return

             # Check whether the page content is too short
             min_length = 100 if "法治日报" in website.name else 300
             if len(page_source) < min_length:
                 print(f"Page content too short, possibly a redirect page: {url}")
                 return

             # Create a BeautifulSoup object
             soup = BeautifulSoup(page_source, "html.parser")
         else:
             # Use requests to fetch static content
             resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
             resp.raise_for_status()

             # Check whether this is a redirect page
             if len(resp.text) < 300:
                 print(f"Page content too short, possibly a redirect page: {url}")
                 return

             # Create a BeautifulSoup object
             soup = BeautifulSoup(resp.text, "html.parser")

     except Exception as e:
         print(f"Request failed: {url}, error: {e}")
         return
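The Selenium/requests branch above can be read as one fetch helper. A sketch of the same logic in isolation; the helper name is invented, and get_page_with_selenium is the function from earlier in this file:

    # Hypothetical consolidation of the fetch branch, for illustration only.
    import requests

    def fetch_article_html(url, website, headers):
        if any(n in website.name for n in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
            html = get_page_with_selenium(url, website.name)
            min_length = 100 if "法治日报" in website.name else 300
        else:
            resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
            resp.raise_for_status()
            html, min_length = resp.text, 300
        if not html or len(html) < min_length:
            return None  # missing page, or likely a redirect stub
        return html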
@@ -353,7 +356,7 @@ def process_article(url, website):
             heading_text = heading.get_text(strip=True)
             if title_text in heading_text or heading_text in title_text:
                 heading.decompose()

         # Remove elements whose class contains "title"
         for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
             title_element_text = title_element.get_text(strip=True)
@@ -489,13 +492,13 @@ def process_article(url, website):
             soup.find("p", class_="title") or
             soup.find("title")
         )

         # Special handling for 求是: if the title is empty or too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
             if not title_text or len(title_text) < 5:
                 title_tag = soup.find("title")

         # Special handling for 求是: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")
@@ -522,7 +525,7 @@ def process_article(url, website):
                 # If the strong tag sits in the first two paragraphs of the body, remove it
                 if parent_p in content_tag.find_all("p")[:2]:
                     strong_tag.decompose()

         # Remove duplicated titles from h1/h2/h3 heading elements
         for heading in content_tag.find_all(["h1", "h2", "h3"]):
             heading_text = heading.get_text(strip=True)
@@ -530,11 +533,12 @@ def process_article(url, website):
                 # Make sure not to remove title_tag itself
                 if heading != title_tag:
                     heading.decompose()

         # Remove elements whose class contains "title"
         for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
             title_element_text = title_element.get_text(strip=True)
-            if title_element_text and (title_text in title_element_text or title_element_text in title_text):
+            if title_element_text and (
+                    title_text in title_element_text or title_element_text in title_text):
                 # Make sure not to remove title_tag itself
                 if title_element != title_tag:
                     title_element.decompose()
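The containment test above (title text contains element text, or vice versa) is the core of the de-duplication. A self-contained demonstration on invented HTML:

    # Toy demonstration of the title de-duplication pattern; the HTML is made up.
    from bs4 import BeautifulSoup

    html = '<div><h1 class="big-title">Headline</h1><p>Body text</p></div>'
    soup = BeautifulSoup(html, "html.parser")
    title_text = "Headline"
    for el in soup.find_all(class_=lambda x: x and "title" in x):
        text = el.get_text(strip=True)
        if text and (title_text in text or text in title_text):
            el.decompose()  # drop the duplicated headline node
    print(soup.div.get_text(strip=True))  # -> "Body text"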
@@ -583,7 +587,7 @@ def process_article(url, website):
             soup.find("h2") or  # 解放军报 uses an h2 tag for the title
             soup.find("title")
         )

         # Special handling for 解放军报: if the title is empty or too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
@@ -606,34 +610,34 @@ def process_article(url, website):
             # Remove breadcrumb navigation
             for breadcrumb in content_tag.find_all("ol", class_="breadcrumb"):
                 breadcrumb.decompose()

             # Remove share-related elements
             for share_element in content_tag.find_all("div", class_="share-custom"):
                 share_element.decompose()

             # Remove author info paragraphs
             for author_p in content_tag.find_all("p"):
                 text = author_p.get_text(strip=True)
                 if "来源:" in text or "作者:" in text or "责任编辑:" in text or "发布:" in text:
                     author_p.decompose()

             # Remove the progress bar
             for progress in content_tag.find_all("div", class_="progress-bar"):
                 progress.decompose()

             # Remove the player
             for player in content_tag.find_all("div", class_="player"):
                 player.decompose()

             # Remove the media URL container
             for media in content_tag.find_all("div", id="mediaurl"):
                 media.decompose()

             # Remove the news list (but keep its contents)
             for news_list in content_tag.find_all("ul", id="main-news-list"):
                 # Don't delete the whole ul; unwrap it to keep its contents
                 news_list.unwrap()

             # Remove editor info
             for editor_element in content_tag.find_all("div", class_="editor"):
                 editor_element.decompose()
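Two different removals are mixed above: decompose() deletes a node together with everything inside it, while unwrap() deletes only the wrapper and splices its children into the parent, which is why the news list survives as bare items. A toy contrast on invented markup:

    # decompose() versus unwrap(), on made-up markup.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div><ul id="keep"><li>A</li></ul><div class="ad">B</div></div>',
                         "html.parser")
    soup.find("ul", id="keep").unwrap()        # the <li> survives, the <ul> wrapper is gone
    soup.find("div", class_="ad").decompose()  # the whole node, text included, is gone
    print(soup)  # -> <div><li>A</li></div>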
@@ -744,13 +748,13 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 工人日报: if the title is empty or too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
             if not title_text or len(title_text) < 5:
                 title_tag = soup.find("title")

         # Further processing: if the h1 title carries too much noise, try the <title> tag for a cleaner title
         if title_tag and title_tag.name == 'h1':
             title_text = title_tag.get_text(strip=True)
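The empty-or-too-short fallback recurs for almost every site below. A sketch of the shared pattern pulled into one helper; the name and the threshold parameter are assumptions:

    # Hypothetical consolidation of the repeated title-fallback logic.
    def pick_title_tag(soup, candidates, min_len=5):
        """Return the first candidate with usable text, else the <title> tag."""
        for tag in candidates:
            if tag is not None:
                text = tag.get_text(strip=True)
                if text and len(text) >= min_len:
                    return tag
        return soup.find("title")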
@@ -877,11 +881,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 中国纪检监察报: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -955,11 +959,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 中国青年报: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="main") or  # content container specific to 中国青年报
             soup.find("div", class_="content") or
@@ -977,11 +981,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 中国妇女报: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="main") or  # content container specific to 中国妇女报
             soup.find("div", class_="news") or  # content container specific to 中国妇女报
@@ -1001,11 +1005,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 法治日报: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="content-two") or  # prefer the content-two class
             soup.find("div", class_="article-content") or  # content container specific to 法治日报
@@ -1058,13 +1062,13 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 农民日报: if the title is garbled, try the <title> tag
         if title_tag and title_tag.name == 'h1':
             title_text = title_tag.get_text(strip=True)
             if title_text and any(char in title_text for char in ['\ufffd']):
                 title_tag = soup.find("title")

         # Special handling for 农民日报: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")
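A standalone version of the garbled-title check, assuming the signal is U+FFFD, the Unicode replacement character that decoders emit for undecodable bytes:

    # Sketch: flag titles containing the Unicode replacement character.
    def looks_garbled(text: str) -> bool:
        return "\ufffd" in text

    print(looks_garbled("正常标题"))       # False
    print(looks_garbled("坏\ufffd标题"))   # True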
@@ -1078,7 +1082,7 @@ def process_article(url, website):
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )

         # Special handling for 农民日报: if several detailCon divs are found, pick the one with the longest content
         if content_tag and content_tag.get('class') and 'detailCon' in content_tag.get('class', []):
             detail_cons = soup.find_all("div", class_="detailCon")
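The selection step is cut off by the hunk boundary; choosing the longest of several candidate containers is typically a max() over text length. A sketch, with the continuation assumed rather than shown:

    # Hypothetical continuation: pick the detailCon div with the most text.
    detail_cons = soup.find_all("div", class_="detailCon")
    if detail_cons:
        content_tag = max(detail_cons, key=lambda d: len(d.get_text(strip=True)))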
@@ -1116,17 +1120,17 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 学习强国: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         # Special handling for 学习强国: if the title is too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
             if title_text and len(title_text) < 10:
                 title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -1153,17 +1157,17 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 旗帜网: if the title is empty or too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
             if not title_text or len(title_text) < 5:
                 title_tag = soup.find("title")

         # Special handling for 旗帜网: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         # Special handling for 旗帜网: if the title is too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
@@ -1232,13 +1236,13 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 中国网: if the title is empty or too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
             if not title_text or len(title_text) < 5:
                 title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="article") or  # content container specific to 中国网
             soup.find("div", class_="main") or
@@ -1281,7 +1285,7 @@ def process_article(url, website):
         # Final title handling: fall back to the default only when no site-specific handling applied
         if not title_tag:
             title_tag = soup.find("h1") or soup.find("title")

         title = title_tag.get_text(strip=True) if title_tag else "无标题"

         # Extra processing of the title: strip possible stray whitespace
@@ -1564,7 +1568,7 @@ def full_site_crawler(start_url, website, max_pages=1000):
                 ("/content/" in path) or
                 (path.startswith("/detail/") and len(path) > 10)
             )

             # Exclude list pages
             if "/index.html" in path or path.endswith("/"):
                 is_article_page = False
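The path tests above combine into an article-versus-list heuristic. Pulled out as a standalone predicate (any tests before this hunk are not shown and are omitted here):

    # Sketch of the URL heuristic; only the tests visible in the hunk are included.
    from urllib.parse import urlparse

    def is_article_url(url: str) -> bool:
        path = urlparse(url).path
        is_article_page = (
            ("/content/" in path) or
            (path.startswith("/detail/") and len(path) > 10)
        )
        # Exclude list pages
        if "/index.html" in path or path.endswith("/"):
            is_article_page = False
        return is_article_page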