From 969d46b07026ef2b667857709e43f49bbb4b9e8d Mon Sep 17 00:00:00 2001
From: yuangyaa
Date: Mon, 11 Aug 2025 13:52:52 +0800
Subject: [PATCH] Crawl article URLs discovered on the Xinhua list page

---
 core/management/commands/crawl_xinhua.py | 19 ++++-----
 core/utils.py                            | 47 ++++++++++++++++-------
 core/views.py                            |  2 +-
 3 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/core/management/commands/crawl_xinhua.py b/core/management/commands/crawl_xinhua.py
index b810425..559f77f 100644
--- a/core/management/commands/crawl_xinhua.py
+++ b/core/management/commands/crawl_xinhua.py
@@ -1,21 +1,18 @@
 from django.core.management.base import BaseCommand
 from core.models import Website
-from core.utils import crawl_xinhua_article
+from core.utils import crawl_xinhua_list
 
 class Command(BaseCommand):
-    help = 'Crawl a sample Xinhua News article'
+    help = 'Batch-crawl Xinhua News articles'
 
     def handle(self, *args, **options):
-        website_name = "新华网"
+        list_url = "https://www.news.cn/legal/index.html"
         try:
-            website = Website.objects.get(name=website_name)
+            website = Website.objects.get(base_url="https://www.news.cn/")
         except Website.DoesNotExist:
-            self.stdout.write(self.style.ERROR(f"Website '{website_name}' does not exist; create it in the admin first"))
+            self.stdout.write(self.style.ERROR("Website https://www.news.cn/ does not exist; add it in the admin first"))
             return
 
-        urls = [
-            "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html",
-        ]
-
-        for url in urls:
-            crawl_xinhua_article(url, website)
+        self.stdout.write(f"Crawling article list page: {list_url}")
+        crawl_xinhua_list(list_url, website)
+        self.stdout.write(self.style.SUCCESS("Batch crawl finished"))
diff --git a/core/utils.py b/core/utils.py
index 02fe2ac..d7ca2f6 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -1,9 +1,9 @@
 import os
 import requests
 from bs4 import BeautifulSoup
+from urllib.parse import urljoin
 from django.utils import timezone
 from django.conf import settings
-from urllib.parse import urljoin
 from core.models import Article
 
 def download_media(url, save_dir):
@@ -27,12 +27,12 @@ def download_media(url, save_dir):
     with open(filepath, "wb") as f:
         f.write(resp.content)
 
-    return filepath
+
+    # Return a path relative to MEDIA_ROOT: easier to store in the DB and to serve
+    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")
 
 def crawl_xinhua_article(url, website):
-    headers = {
-        "User-Agent": "Mozilla/5.0"
-    }
+    headers = {"User-Agent": "Mozilla/5.0"}
     resp = requests.get(url, headers=headers)
     resp.encoding = 'utf-8'
     soup = BeautifulSoup(resp.text, "html.parser")
@@ -42,7 +42,7 @@ def crawl_xinhua_article(url, website):
 
     content_tag = soup.find("span", id="detailContent")
     if not content_tag:
-        print("Article body not found")
+        print(f"Article body not found, skipping: {url}")
         return
 
     imgs = content_tag.find_all("img")
@@ -54,17 +54,17 @@ def crawl_xinhua_article(url, website):
 
     for img in imgs:
         src = img.get("src")
-        print("original image src =", src)
         if not src:
             continue
-        # Join against the article URL so the image link is absolute
-        src = urljoin(url, src)
-        print("joined image URL =", src)
-        local_path = download_media(src, save_dir)
-        if local_path:
-            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
-            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
-            media_files.append(rel_path.replace("\\", "/"))
+
+        # Resolve relative image paths against the article URL
+        if not src.startswith("http"):
+            src = urljoin(url, src)
+
+        local_rel_path = download_media(src, save_dir)
+        if local_rel_path:
+            img["src"] = settings.MEDIA_URL + local_rel_path
+            media_files.append(local_rel_path)
 
     content_html = str(content_tag)
 
@@ -81,3 +81,20 @@ def crawl_xinhua_article(url, website):
         media_files=media_files
     )
     print(f"Saved article and images: {title}")
+
+def crawl_xinhua_list(list_url, website):
+    headers = {"User-Agent": "Mozilla/5.0"}
+    resp = requests.get(list_url, headers=headers)
+    resp.encoding = 'utf-8'
+    soup = BeautifulSoup(resp.text, "html.parser")
+
+    article_urls = set()
+    for link in soup.find_all("a", href=True):
+        href = link["href"]
+        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
+            article_urls.add(href)
+
+    print(f"Found {len(article_urls)} article links on the list page")
+    for article_url in article_urls:
+        print("Article link:", article_url)
+        crawl_xinhua_article(article_url, website)
diff --git a/core/views.py b/core/views.py
index 956c854..d1f39b5 100644
--- a/core/views.py
+++ b/core/views.py
@@ -7,7 +7,7 @@ def article_list(request):
     View that renders the article list
     """
     articles = Article.objects.all().order_by('-created_at')
-    paginator = Paginator(articles, 10)  # show 10 articles per page
+    paginator = Paginator(articles, 20)  # show 20 articles per page
     page_number = request.GET.get('page')
     page_obj = paginator.get_page(page_number)
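
A quick way to sanity-check the link-discovery filter added in crawl_xinhua_list, without hitting the live site: the sketch below factors the same startswith/endswith check into a standalone helper and runs it on inline HTML. extract_article_urls is a hypothetical name introduced here for illustration only; it is not part of the patch.

    # Minimal standalone sketch; assumes only that bs4 is installed.
    # extract_article_urls is a hypothetical helper mirroring the filter in
    # crawl_xinhua_list, but it takes pre-fetched HTML so no network is needed.
    from bs4 import BeautifulSoup

    def extract_article_urls(html):
        soup = BeautifulSoup(html, "html.parser")
        urls = set()
        for link in soup.find_all("a", href=True):
            href = link["href"]
            # Same filter as the patch: absolute /legal/ links ending in c.html
            if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
                urls.add(href)
        return urls

    sample = '<a href="https://www.news.cn/legal/20250811/0a1b2c/c.html">story</a>'
    print(extract_article_urls(sample))
    # -> {'https://www.news.cn/legal/20250811/0a1b2c/c.html'}

One caveat of this filter: it only matches absolute URLs, so any relative hrefs on the list page are silently skipped; applying urljoin(list_url, href) before the check would catch those as well. The end-to-end pipeline runs via the management command, i.e. python manage.py crawl_xinhua.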