Fix bugs and add 人民日报 (People's Daily) support to article parsing and site crawling

2025-08-13 21:35:11 +08:00
parent c618528a0a
commit 31d0525cd0
10 changed files with 243 additions and 897 deletions
--- a/core/utils.py
+++ b/core/utils.py
@@ -107,6 +107,17 @@ def process_article(url, website):
                soup.find("div", id="content") or
                soup.find("div", class_="mainBody")
        )
+    elif website.name == "人民日报":
+        # Handle the article structure of the People's Daily (人民日报) site
+        title_tag = soup.find("h1") or soup.find("title")
+        # Locate the main content area, trying the known container selectors in order
+        content_tag = (
+                soup.find("div", class_="content") or
+                soup.find("div", class_="article-content") or
+                soup.find("div", id="content") or
+                soup.find("div", class_="text") or
+                soup.find("section", class_="content")
+        )
    else:
        # 默认处理方式
        title_tag = soup.find("h1") or soup.find("title")
@@ -256,6 +267,24 @@ def full_site_crawler(start_url, website, max_pages=1000):
                    ("/xinwen/" in path) or
                    ("/huoban/" in path)
            )
+        elif website.name == "人民日报":
+            # Decide whether this People's Daily (人民日报) page is an article page
+            parsed_url = urlparse(url)
+            path = parsed_url.path
+            # A page counts as an article if either its DOM structure or its URL path matches
+            is_article_page = (
+                    (soup.find("div", class_="content") is not None and 
+                     soup.find("h1") is not None) or
+                    soup.find("div", class_="article-content") is not None or
+                    (soup.find("div", id="content") is not None and 
+                     soup.find("h1") is not None) or
+                    soup.find("div", class_="text") is not None or
+                    soup.find("section", class_="content") is not None or
+                    ("/article/" in path) or
+                    (path.startswith("/detail/") and len(path) > 10) or
+                    # Also match the peopleapp.com-specific article path pattern
+                    ("/dynamic/" in path and "article" in path)
+            )
        else:
            # 默认判断逻辑
            is_article_page = (
@@ -271,5 +300,18 @@ def full_site_crawler(start_url, website, max_pages=1000):
        # 扩展队列，发现新链接
        for link in soup.find_all("a", href=True):
            href = urljoin(url, link["href"])
-            if href not in visited and is_valid_url(href, base_netloc):
+            # For the People's Daily (人民日报) site, use extended link-discovery rules
+            if website.name == "人民日报":
+                # Allow crawling of links that start with https://www.peopleapp.com/
+                if href.startswith("https://www.peopleapp.com/") and href not in visited:
+                    # Identify links that look like article URLs
+                    parsed_href = urlparse(href)
+                    href_path = parsed_href.path
+                    # Additional URL path patterns that may indicate an article link
+                    if ("/article/" in href_path or 
+                        href_path.startswith("/detail/") or 
+                        ("/dynamic/" in href_path and "article" in href_path) or
+                        href_path.count("/") > 2):  # 更深层的页面可能是文章页
+                        queue.append(href)
+            elif href not in visited and is_valid_url(href, base_netloc):
                queue.append(href)