This commit is contained in:
2025-08-13 21:35:11 +08:00
parent c618528a0a
commit 31d0525cd0
10 changed files with 243 additions and 897 deletions

View File

@@ -107,6 +107,17 @@ def process_article(url, website):
soup.find("div", id="content") or
soup.find("div", class_="mainBody")
)
elif website.name == "人民日报":
# Article structure handling for the People's Daily (人民日报) site.
# Title: the first <h1> on the page, falling back to the document <title>.
title_tag = soup.find("h1") or soup.find("title")
# Locate the main content area. The lookups below are tried in order and
# the first one that finds an element is used; if none match, content_tag
# is whatever the last lookup returned.
content_tag = (
soup.find("div", class_="content") or
soup.find("div", class_="article-content") or
soup.find("div", id="content") or
soup.find("div", class_="text") or
soup.find("section", class_="content")
)
else:
# 默认处理方式
title_tag = soup.find("h1") or soup.find("title")
@@ -256,6 +267,24 @@ def full_site_crawler(start_url, website, max_pages=1000):
("/xinwen/" in path) or
("/huoban/" in path)
)
elif website.name == "人民日报":
# Article-page detection for the People's Daily (人民日报) site.
parsed_url = urlparse(url)
path = parsed_url.path
# Changed: use a more precise article-page check. The page is treated as
# an article when ANY of the DOM or URL-path conditions below holds.
is_article_page = (
# div.content and div#content only count when the page also has an <h1>;
# the other container lookups count on their own.
(soup.find("div", class_="content") is not None and
soup.find("h1") is not None) or
soup.find("div", class_="article-content") is not None or
(soup.find("div", id="content") is not None and
soup.find("h1") is not None) or
soup.find("div", class_="text") is not None or
soup.find("section", class_="content") is not None or
("/article/" in path) or
# "/detail/" is 8 characters, so len(path) > 10 requires at least three
# more characters after the prefix (i.e. not the bare listing path).
(path.startswith("/detail/") and len(path) > 10) or
# Also recognise the peopleapp.com-specific article paths.
("/dynamic/" in path and "article" in path)
)
else:
# 默认判断逻辑
is_article_page = (
@@ -271,5 +300,18 @@ def full_site_crawler(start_url, website, max_pages=1000):
# Extend the queue with newly discovered links.
for link in soup.find_all("a", href=True):
# Resolve relative hrefs against the URL of the page being processed.
href = urljoin(url, link["href"])
# NOTE(review): this guard repeats the condition of the `elif` further down.
# This view looks like a diff with its +/- markers and indentation stripped,
# so this line may be the pre-change version of that `elif` - confirm
# against the actual file before relying on the nesting.
if href not in visited and is_valid_url(href, base_netloc):
# For the People's Daily (人民日报) site the link-discovery logic is extended.
if website.name == "人民日报":
# Allow crawling links that start with https://www.peopleapp.com/.
# NOTE(review): this branch does not call is_valid_url / use base_netloc;
# the host check is the hard-coded prefix below - confirm that is intended.
if href.startswith("https://www.peopleapp.com/") and href not in visited:
# Recognise links that look like article pages.
parsed_href = urlparse(href)
href_path = parsed_href.path
# Additional candidate article-link patterns; any one of them is enough
# for the link to be queued.
if ("/article/" in href_path or
href_path.startswith("/detail/") or
("/dynamic/" in href_path and "article" in href_path) or
href_path.count("/") > 2): # deeper pages are probably article pages
queue.append(href)
# All other sites: queue any unvisited link accepted by is_valid_url.
elif href not in visited and is_valid_url(href, base_netloc):
queue.append(href)