Add full-site crawl support
core/management/commands/crawl_full_site.py (new file, 20 lines added)
@@ -0,0 +1,20 @@
# core/management/commands/crawl_full_site.py
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler

class Command(BaseCommand):
    help = "Recursively crawl the entire www.news.cn site"

    def handle(self, *args, **kwargs):
        website, created = Website.objects.get_or_create(
            name="www.news.cn",
            defaults={
                'article_list_url': 'https://www.news.cn/',
                'article_selector': 'a'
            }
        )
        start_url = "https://www.news.cn/"
        self.stdout.write(f"Starting full-site crawl: {start_url}")
        full_site_crawler(start_url, website, max_pages=500)
        self.stdout.write("Crawl finished")
core/utils.py
@@ -1,7 +1,9 @@
# core/utils.py
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
from django.utils import timezone
from django.conf import settings
from core.models import Article
@@ -27,11 +29,13 @@ def download_media(url, save_dir):

    with open(filepath, "wb") as f:
        f.write(resp.content)
    return filepath

    # Return a path relative to MEDIA_ROOT, which is easier to store in the database and to display
    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")
def process_article(url, website):
    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return

def crawl_xinhua_article(url, website):
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
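With this change, download_media hands callers a path relative to MEDIA_ROOT instead of an absolute filesystem path. A small standalone illustration of that conversion (the directory layout and settings values below are invented):

# Sketch: how a MEDIA_ROOT-relative path maps to a servable URL.
import os

MEDIA_ROOT = "/srv/app/media"   # stand-in for settings.MEDIA_ROOT
MEDIA_URL = "/media/"           # stand-in for settings.MEDIA_URL

filepath = os.path.join(MEDIA_ROOT, "articles", "2024", "photo.jpg")
rel_path = os.path.relpath(filepath, MEDIA_ROOT).replace("\\", "/")
print(rel_path)                 # articles/2024/photo.jpg
print(MEDIA_URL + rel_path)     # /media/articles/2024/photo.jpg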
@@ -42,7 +46,7 @@ def crawl_xinhua_article(url, website):

    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        print(f"Article body not found, skipping article: {url}")
        print("Article body not found, skipping:", url)
        return

    imgs = content_tag.find_all("img")
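The span with id "detailContent" is what the crawler treats as the article body, and the full-site crawler further down reuses the same check to tell article pages apart from listing pages. A quick sketch with made-up HTML:

# Sketch: distinguishing article pages from listing pages via span#detailContent.
from bs4 import BeautifulSoup

article_html = '<span id="detailContent"><p>body text</p></span>'
listing_html = '<ul><li><a href="/legal/20240101/abcc.html">headline</a></li></ul>'

print(BeautifulSoup(article_html, "html.parser").find("span", id="detailContent") is not None)  # True
print(BeautifulSoup(listing_html, "html.parser").find("span", id="detailContent") is not None)  # False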
@@ -56,22 +60,16 @@ def crawl_xinhua_article(url, website):
        src = img.get("src")
        if not src:
            continue

        # Use the article URL as the base when joining relative paths, to avoid broken links
        if not src.startswith("http"):
            src = urljoin(url, src)

        local_rel_path = download_media(src, save_dir)
        if local_rel_path:
            img["src"] = settings.MEDIA_URL + local_rel_path
            media_files.append(local_rel_path)
        local_path = download_media(src, save_dir)
        if local_path:
            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
            media_files.append(rel_path.replace("\\", "/"))

    content_html = str(content_tag)

    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return

    article = Article.objects.create(
        website=website,
        title=title,
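Relative image sources are resolved against the article URL with urljoin, so both page-relative and root-relative src values become absolute before download. A short illustration (URLs are invented):

# Sketch: resolving <img src> values against the article URL.
from urllib.parse import urljoin

article_url = "https://www.news.cn/legal/20240101/abcc.html"
print(urljoin(article_url, "images/photo.jpg"))   # https://www.news.cn/legal/20240101/images/photo.jpg
print(urljoin(article_url, "/static/logo.png"))   # https://www.news.cn/static/logo.png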
@@ -82,22 +80,50 @@ def crawl_xinhua_article(url, website):
    )
    print(f"Saved article and images: {title}")

def crawl_xinhua_list(list_url, website):
def is_valid_url(url, base_netloc):
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return False
        if parsed.netloc != base_netloc:
            return False
        return True
    except Exception:
        return False

def full_site_crawler(start_url, website, max_pages=1000):
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(list_url, headers=headers)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")
    visited = set()
    queue = deque([start_url])

    article_urls = set()
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
            article_urls.add(href)
    base_netloc = urlparse(start_url).netloc

    print(f"Found {len(article_urls)} article links on the list page")
    for url in article_urls:
        print("Article link:", url)
    pages_crawled = 0

    from core.utils import crawl_xinhua_article
    for article_url in article_urls:
        crawl_xinhua_article(article_url, website)
    while queue and pages_crawled < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        print(f"Crawling: {url}")
        visited.add(url)

        try:
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()
        except Exception as e:
            print(f"Request failed: {url}, error: {e}")
            continue

        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")

        # If this is an article page, hand it to the article processor
        if soup.find("span", id="detailContent"):
            process_article(url, website)
            pages_crawled += 1

        # Expand the queue with newly discovered links
        for link in soup.find_all("a", href=True):
            href = urljoin(url, link["href"])
            if href not in visited and is_valid_url(href, base_netloc):
                queue.append(href)

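The is_valid_url filter keeps the breadth-first crawl on the start host and ignores non-HTTP(S) schemes. A quick check of that logic with invented URLs:

# Sketch: which discovered links the same-host filter keeps.
from urllib.parse import urlparse

base_netloc = urlparse("https://www.news.cn/").netloc

for candidate in (
    "https://www.news.cn/legal/20240101/abcc.html",  # same host: followed
    "https://english.news.cn/world/index.htm",       # other host: skipped
    "javascript:void(0)",                            # not http(s): skipped
):
    parsed = urlparse(candidate)
    ok = parsed.scheme in ("http", "https") and parsed.netloc == base_netloc
    print(candidate, ok)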