fix bugs
This commit is contained in:
@@ -107,6 +107,17 @@ def process_article(url, website):
|
||||
soup.find("div", id="content") or
|
||||
soup.find("div", class_="mainBody")
|
||||
)
|
||||
elif website.name == "人民日报":
|
||||
# Article structure handling for the People's Daily (人民日报) site.
|
||||
title_tag = soup.find("h1") or soup.find("title")
|
||||
# Locate the main content container, trying each known selector in order.
|
||||
content_tag = (
|
||||
soup.find("div", class_="content") or
|
||||
soup.find("div", class_="article-content") or
|
||||
soup.find("div", id="content") or
|
||||
soup.find("div", class_="text") or
|
||||
soup.find("section", class_="content")
|
||||
)
|
||||
else:
|
||||
# Default handling for all other sites.
|
||||
title_tag = soup.find("h1") or soup.find("title")
|
||||
@@ -256,6 +267,24 @@ def full_site_crawler(start_url, website, max_pages=1000):
|
||||
("/xinwen/" in path) or
|
||||
("/huoban/" in path)
|
||||
)
|
||||
elif website.name == "人民日报":
|
||||
# Article-page detection for the People's Daily (人民日报) site.
|
||||
parsed_url = urlparse(url)
|
||||
path = parsed_url.path
|
||||
# A page counts as an article when its markup or its URL path matches any of the patterns below.
|
||||
is_article_page = (
|
||||
(soup.find("div", class_="content") is not None and
|
||||
soup.find("h1") is not None) or
|
||||
soup.find("div", class_="article-content") is not None or
|
||||
(soup.find("div", id="content") is not None and
|
||||
soup.find("h1") is not None) or
|
||||
soup.find("div", class_="text") is not None or
|
||||
soup.find("section", class_="content") is not None or
|
||||
("/article/" in path) or
|
||||
(path.startswith("/detail/") and len(path) > 10) or
|
||||
# peopleapp.com article paths: the path contains both "/dynamic/" and "article".
|
||||
("/dynamic/" in path and "article" in path)
|
||||
)
|
||||
else:
|
||||
# Default article-page detection for all other sites.
|
||||
is_article_page = (
|
||||
@@ -271,5 +300,18 @@ def full_site_crawler(start_url, website, max_pages=1000):
|
||||
# Grow the crawl queue with newly discovered links.
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = urljoin(url, link["href"])
|
||||
if href not in visited and is_valid_url(href, base_netloc):
|
||||
# The People's Daily (人民日报) site gets additional link-discovery rules.
|
||||
if website.name == "人民日报":
|
||||
# Only consider links that start with https://www.peopleapp.com/.
|
||||
if href.startswith("https://www.peopleapp.com/") and href not in visited:
|
||||
# Parse the link so its path can be checked against article URL patterns.
|
||||
parsed_href = urlparse(href)
|
||||
href_path = parsed_href.path
|
||||
# Path patterns that may indicate an article link.
|
||||
if ("/article/" in href_path or
|
||||
href_path.startswith("/detail/") or
|
||||
("/dynamic/" in href_path and "article" in href_path) or
|
||||
href_path.count("/") > 2): # 更深层的页面可能是文章页
|
||||
queue.append(href)
|
||||
elif href not in visited and is_valid_url(href, base_netloc):
|
||||
queue.append(href)
|
||||
|
||||
Reference in New Issue
Block a user