diff --git a/core/management/commands/crawl_full_site.py b/core/management/commands/crawl_full_site.py
new file mode 100644
index 0000000..1c25982
--- /dev/null
+++ b/core/management/commands/crawl_full_site.py
@@ -0,0 +1,20 @@
+# core/management/commands/crawl_full_site.py
+from django.core.management.base import BaseCommand
+from core.models import Website
+from core.utils import full_site_crawler
+
+class Command(BaseCommand):
+    help = "Recursively crawl the entire www.news.cn site"
+
+    def handle(self, *args, **kwargs):
+        website, created = Website.objects.get_or_create(
+            name="www.news.cn",
+            defaults={
+                'article_list_url': 'https://www.news.cn/',
+                'article_selector': 'a'
+            }
+        )
+        start_url = "https://www.news.cn/"
+        self.stdout.write(f"Starting full-site crawl: {start_url}")
+        full_site_crawler(start_url, website, max_pages=500)
+        self.stdout.write("Crawl finished")
\ No newline at end of file
diff --git a/core/utils.py b/core/utils.py
index d7ca2f6..0b50fee 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -1,7 +1,9 @@
+# core/utils.py
 import os
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
+from collections import deque
 from django.utils import timezone
 from django.conf import settings
 from core.models import Article
@@ -27,11 +29,13 @@ def download_media(url, save_dir):
     with open(filepath, "wb") as f:
         f.write(resp.content)
 
+    return filepath
-    # Return a relative path, convenient for storing in the database and for display
-    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")
 
 
+def process_article(url, website):
+    if Article.objects.filter(url=url).exists():
+        print(f"Article already exists, skipping: {url}")
+        return
-def crawl_xinhua_article(url, website):
     headers = {"User-Agent": "Mozilla/5.0"}
     resp = requests.get(url, headers=headers)
     resp.encoding = 'utf-8'
@@ -42,7 +46,7 @@ def crawl_xinhua_article(url, website):
     content_tag = soup.find("span", id="detailContent")
     if not content_tag:
-        print(f"No article body found, skipping article: {url}")
+        print("No article body found, skipping:", url)
         return
 
     imgs = content_tag.find_all("img")
@@ -56,22 +60,16 @@ def crawl_xinhua_article(url, website):
         src = img.get("src")
         if not src:
             continue
-
-        # Join relative image paths against the article URL to avoid broken links
         if not src.startswith("http"):
            src = urljoin(url, src)
-
-        local_rel_path = download_media(src, save_dir)
-        if local_rel_path:
-            img["src"] = settings.MEDIA_URL + local_rel_path
-            media_files.append(local_rel_path)
+        local_path = download_media(src, save_dir)
+        if local_path:
+            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
+            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
+            media_files.append(rel_path.replace("\\", "/"))
 
     content_html = str(content_tag)
 
-    if Article.objects.filter(url=url).exists():
-        print(f"Article already exists, skipping: {url}")
-        return
-
     article = Article.objects.create(
         website=website,
         title=title,
@@ -82,22 +80,50 @@ def crawl_xinhua_article(url, website):
     )
     print(f"Saved article and images: {title}")
 
 
-def crawl_xinhua_list(list_url, website):
+def is_valid_url(url, base_netloc):
+    try:
+        parsed = urlparse(url)
+        if parsed.scheme not in ("http", "https"):
+            return False
+        if parsed.netloc != base_netloc:
+            return False
+        return True
+    except Exception:
+        return False
+
+def full_site_crawler(start_url, website, max_pages=1000):
     headers = {"User-Agent": "Mozilla/5.0"}
-    resp = requests.get(list_url, headers=headers)
-    resp.encoding = 'utf-8'
-    soup = BeautifulSoup(resp.text, "html.parser")
+    visited = set()
+    queue = deque([start_url])
 
-    article_urls = set()
-    for link in soup.find_all("a", href=True):
-        href = link["href"]
-        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
-            article_urls.add(href)
+    base_netloc = urlparse(start_url).netloc
 
-    print(f"Found {len(article_urls)} article links on the list page")
-    for url in article_urls:
-        print("Article link:", url)
+    pages_crawled = 0
 
-    from core.utils import crawl_xinhua_article
-    for article_url in article_urls:
-        crawl_xinhua_article(article_url, website)
+    while queue and pages_crawled < max_pages:
+        url = queue.popleft()
+        if url in visited:
+            continue
+        print(f"Crawling: {url}")
+        visited.add(url)
+
+        try:
+            resp = requests.get(url, headers=headers, timeout=15)
+            resp.raise_for_status()
+        except Exception as e:
+            print(f"Request failed: {url}, error: {e}")
+            continue
+
+        resp.encoding = 'utf-8'
+        soup = BeautifulSoup(resp.text, "html.parser")
+
+        # If this looks like an article page, hand it off for article processing
+        if soup.find("span", id="detailContent"):
+            process_article(url, website)
+            pages_crawled += 1
+
+        # Grow the queue with newly discovered same-site links
+        for link in soup.find_all("a", href=True):
+            href = urljoin(url, link["href"])
+            if href not in visited and is_valid_url(href, base_netloc):
+                queue.append(href)
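Reviewer note: a minimal sketch for exercising the new crawler locally, assuming the core app is installed and migrations have been applied. The command name crawl_full_site follows from the new file's name (Django derives it from the module); max_pages=20 is an arbitrary small budget chosen here for a smoke test, not a value from this diff.

    # Option 1: run the management command added in this diff
    #   python manage.py crawl_full_site
    #
    # Option 2: drive full_site_crawler directly from `python manage.py shell`
    from core.models import Website
    from core.utils import full_site_crawler

    # Same Website record the command creates (fields taken from the diff above)
    website, _ = Website.objects.get_or_create(
        name="www.news.cn",
        defaults={
            "article_list_url": "https://www.news.cn/",
            "article_selector": "a",
        },
    )
    # Small page budget so a test run finishes quickly (assumed value)
    full_site_crawler("https://www.news.cn/", website, max_pages=20)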