From da1b8d98e460ed45ea4ce752a48d8f9a5981d310 Mon Sep 17 00:00:00 2001
From: yuangyaa
Date: Mon, 11 Aug 2025 13:28:32 +0800
Subject: [PATCH] Add download media

---
 .gitignore                               |  8 +++
 core/management/commands/__init__.py     |  0
 core/management/commands/crawl_xinhua.py |  2 -
 core/utils.py                            | 62 +++++++++++++++++++++---
 4 files changed, 63 insertions(+), 9 deletions(-)
 create mode 100644 core/management/commands/__init__.py

diff --git a/.gitignore b/.gitignore
index 575c1ad..dcc299e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,11 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+#####################################
+#
+# jimmy.fang: ignore data/media/
+#
+#####################################
+
+data/media/
+
diff --git a/core/management/commands/__init__.py b/core/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/core/management/commands/crawl_xinhua.py b/core/management/commands/crawl_xinhua.py
index 12b4a5f..b810425 100644
--- a/core/management/commands/crawl_xinhua.py
+++ b/core/management/commands/crawl_xinhua.py
@@ -6,7 +6,6 @@ class Command(BaseCommand):
     help = '爬取新华网文章示例'
 
     def handle(self, *args, **options):
-        # 假设你事先在后台建了“新华网”这个Website实例
         website_name = "新华网"
         try:
             website = Website.objects.get(name=website_name)
@@ -14,7 +13,6 @@ class Command(BaseCommand):
             self.stdout.write(self.style.ERROR(f"网站 '{website_name}' 不存在,请先后台创建"))
             return
 
-        # 这里写你想爬取的文章URL列表,可以循环多篇
         urls = [
             "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html",
         ]
diff --git a/core/utils.py b/core/utils.py
index 2086378..02fe2ac 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -1,7 +1,33 @@
+import os
 import requests
 from bs4 import BeautifulSoup
 from django.utils import timezone
-from core.models import Website, Article
+from django.conf import settings
+from urllib.parse import urljoin
+from core.models import Article
+
+def download_media(url, save_dir):
+    try:
+        resp = requests.get(url, timeout=15)
+        resp.raise_for_status()
+    except Exception as e:
+        print(f"下载失败:{url},错误:{e}")
+        return None
+
+    filename = url.split("/")[-1].split("?")[0]
+    os.makedirs(save_dir, exist_ok=True)
+    filepath = os.path.join(save_dir, filename)
+
+    base, ext = os.path.splitext(filename)
+    counter = 1
+    while os.path.exists(filepath):
+        filename = f"{base}_{counter}{ext}"
+        filepath = os.path.join(save_dir, filename)
+        counter += 1
+
+    with open(filepath, "wb") as f:
+        f.write(resp.content)
+    return filepath
 
 def crawl_xinhua_article(url, website):
     headers = {
@@ -11,16 +37,37 @@ def crawl_xinhua_article(url, website):
     resp.encoding = 'utf-8'
     soup = BeautifulSoup(resp.text, "html.parser")
 
-    # 提取标题
     title_tag = soup.find("span", class_="title")
     title = title_tag.get_text(strip=True) if title_tag else "无标题"
 
-    # 提取正文
     content_tag = soup.find("span", id="detailContent")
-    paragraphs = content_tag.find_all("p") if content_tag else []
-    content_html = "".join(str(p) for p in paragraphs)  # 保留p标签的html结构
+    if not content_tag:
+        print("没有找到正文")
+        return
+
+    imgs = content_tag.find_all("img")
+    media_files = []
+
+    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
+    save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title)
+    os.makedirs(save_dir, exist_ok=True)
+
+    for img in imgs:
+        src = img.get("src")
+        print("原始图片 src =", src)
+        if not src:
+            continue
+        # Join relative srcs against the article page URL to get an absolute image link
+        src = urljoin(url, src)
+        print("拼接后图片 URL =", src)
+        local_path = download_media(src, save_dir)
+        if local_path:
+            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
+            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
+            media_files.append(rel_path.replace("\\", "/"))
+
+    content_html = str(content_tag)
 
-    # 如果文章已存在,则不重复插入
     if Article.objects.filter(url=url).exists():
         print(f"文章已存在,跳过: {url}")
         return
@@ -31,5 +78,6 @@
         url=url,
         content=content_html,
         pub_date=timezone.now(),
+        media_files=media_files
     )
-    print(f"已保存文章:{title}")
+    print(f"已保存文章及图片:{title}")
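
Note on the media_files=media_files argument above: this patch does not touch core/models.py, so it assumes the Article model already has a field that can store the list of MEDIA_ROOT-relative paths collected by the crawler. A minimal sketch of what such a field could look like is below; this is an assumption, not part of the patch, and every field other than media_files is illustrative (JSONField requires Django 3.1+).

    # core/models.py -- hypothetical sketch, not included in this diff
    from django.db import models

    class Website(models.Model):
        name = models.CharField(max_length=100, unique=True)

    class Article(models.Model):
        website = models.ForeignKey(Website, on_delete=models.CASCADE)
        title = models.CharField(max_length=255)
        url = models.URLField(unique=True)
        content = models.TextField()
        pub_date = models.DateTimeField()
        # MEDIA_ROOT-relative paths of downloaded images,
        # e.g. ["articles/<safe_title>/pic.jpg", ...]
        media_files = models.JSONField(default=list, blank=True)

Once such a field (and its migration) is in place, the crawler can be exercised with "python manage.py crawl_xinhua": it downloads each article's images into MEDIA_ROOT/articles/<safe_title>/, rewrites the img src attributes to MEDIA_URL paths, and saves the article with the list of downloaded files.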