diff --git a/core/management/commands/crawl_cngov.py b/core/management/commands/crawl_cngov.py
index 4e1e4de..40d9146 100644
--- a/core/management/commands/crawl_cngov.py
+++ b/core/management/commands/crawl_cngov.py
@@ -17,4 +17,4 @@ class Command(BaseCommand):
         start_url = "https://www.gov.cn/"
         self.stdout.write(f"开始全站爬取: {start_url}")
         full_site_crawler(start_url, website, max_pages=500)
-        self.stdout.write("爬取完成")
\ No newline at end of file
+        self.stdout.write("爬取完成")
diff --git a/core/utils.py b/core/utils.py
index b6688fd..9bc3ba2 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -95,11 +95,11 @@ def process_article(url, website):
         title_tag = soup.find("h1") or soup.find("title")
         # 查找主要内容区域,通常在.mainBody或content中
         content_tag = (
-            soup.find("div", class_="pages_content") or
-            soup.find("div", class_="article_con") or
-            soup.find("div", class_="content") or
-            soup.find("div", id="content") or
-            soup.find("div", class_="mainBody")
+            soup.find("div", class_="pages_content") or
+            soup.find("div", class_="article_con") or
+            soup.find("div", class_="content") or
+            soup.find("div", id="content") or
+            soup.find("div", class_="mainBody")
         )
     else:
         # 默认处理方式
@@ -205,14 +205,14 @@ def full_site_crawler(start_url, website, max_pages=1000):
         parsed_url = urlparse(url)
         path = parsed_url.path
         is_article_page = (
-            soup.find("div", class_="pages_content") is not None or
-            soup.find("div", class_="article_con") is not None or
-            soup.find("div", class_="content") is not None or
-            soup.find("div", id="content") is not None or
-            soup.find("div", class_="mainBody") is not None or
-            ("/zhengce/" in path) or
-            ("/xinwen/" in path) or
-            ("/huoban/" in path)
+            soup.find("div", class_="pages_content") is not None or
+            soup.find("div", class_="article_con") is not None or
+            soup.find("div", class_="content") is not None or
+            soup.find("div", id="content") is not None or
+            soup.find("div", class_="mainBody") is not None or
+            ("/zhengce/" in path) or
+            ("/xinwen/" in path) or
+            ("/huoban/" in path)
         )
     else:
         # 默认判断逻辑
@@ -230,4 +230,4 @@ def full_site_crawler(start_url, website, max_pages=1000):
         for link in soup.find_all("a", href=True):
             href = urljoin(url, link["href"])
             if href not in visited and is_valid_url(href, base_netloc):
-                queue.append(href)
\ No newline at end of file
+                queue.append(href)