From da1b8d98e460ed45ea4ce752a48d8f9a5981d310 Mon Sep 17 00:00:00 2001
From: yuangyaa
Date: Mon, 11 Aug 2025 13:28:32 +0800
Subject: [PATCH] Add download media

---
 .gitignore                               |  8 +++
 core/management/commands/__init__.py     |  0
 core/management/commands/crawl_xinhua.py |  2 -
 core/utils.py                            | 62 +++++++++++++++++++++---
 4 files changed, 63 insertions(+), 9 deletions(-)
 create mode 100644 core/management/commands/__init__.py

diff --git a/.gitignore b/.gitignore
index 575c1ad..dcc299e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,11 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+#####################################
+#
+# jimmy.fang: ignore data/media/
+#
+#####################################
+
+data/media/
+
diff --git a/core/management/commands/__init__.py b/core/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/core/management/commands/crawl_xinhua.py b/core/management/commands/crawl_xinhua.py
index 12b4a5f..b810425 100644
--- a/core/management/commands/crawl_xinhua.py
+++ b/core/management/commands/crawl_xinhua.py
@@ -6,7 +6,6 @@ class Command(BaseCommand):
     help = '爬取新华网文章示例'
 
     def handle(self, *args, **options):
-        # 假设你事先在后台建了“新华网”这个Website实例
         website_name = "新华网"
         try:
             website = Website.objects.get(name=website_name)
@@ -14,7 +13,6 @@ class Command(BaseCommand):
             self.stdout.write(self.style.ERROR(f"网站 '{website_name}' 不存在,请先后台创建"))
             return
 
-        # 这里写你想爬取的文章URL列表,可以循环多篇
         urls = [
             "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html",
         ]
diff --git a/core/utils.py b/core/utils.py
index 2086378..02fe2ac 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -1,7 +1,33 @@
+import os
 import requests
 from bs4 import BeautifulSoup
 from django.utils import timezone
-from core.models import Website, Article
+from django.conf import settings
+from urllib.parse import urljoin
+from core.models import Article
+
+def download_media(url, save_dir):
+    try:
+        resp = requests.get(url, timeout=15)
+        resp.raise_for_status()
+    except Exception as e:
+        print(f"下载失败:{url},错误:{e}")
+        return None
+
+    filename = url.split("/")[-1].split("?")[0]
+    os.makedirs(save_dir, exist_ok=True)
+    filepath = os.path.join(save_dir, filename)
+
+    base, ext = os.path.splitext(filename)
+    counter = 1
+    while os.path.exists(filepath):
+        filename = f"{base}_{counter}{ext}"
+        filepath = os.path.join(save_dir, filename)
+        counter += 1
+
+    with open(filepath, "wb") as f:
+        f.write(resp.content)
+    return filepath
 
 def crawl_xinhua_article(url, website):
     headers = {
@@ -11,16 +37,37 @@ def crawl_xinhua_article(url, website):
     resp.encoding = 'utf-8'
     soup = BeautifulSoup(resp.text, "html.parser")
 
-    # 提取标题
     title_tag = soup.find("span", class_="title")
     title = title_tag.get_text(strip=True) if title_tag else "无标题"
 
-    # 提取正文
     content_tag = soup.find("span", id="detailContent")
-    paragraphs = content_tag.find_all("p") if content_tag else []
-    content_html = "".join(str(p) for p in paragraphs)  # 保留p标签的html结构
+    if not content_tag:
+        print("没有找到正文")
+        return
+
+    imgs = content_tag.find_all("img")
+    media_files = []
+
+    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
+    save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title)
+    os.makedirs(save_dir, exist_ok=True)
+
+    for img in imgs:
+        src = img.get("src")
+        print("原始图片 src =", src)
+        if not src:
+            continue
+        # Join relative srcs against the article page URL to get an absolute image link
+        src = urljoin(url, src)
+        print("拼接后图片 URL =", src)
+        local_path = download_media(src, save_dir)
+        if local_path:
+            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
+            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
+            media_files.append(rel_path.replace("\\", "/"))
+
+    content_html = str(content_tag)
 
-    # 如果文章已存在,则不重复插入
     if Article.objects.filter(url=url).exists():
         print(f"文章已存在,跳过: {url}")
         return
@@ -31,5 +78,6 @@
         url=url,
         content=content_html,
         pub_date=timezone.now(),
+        media_files=media_files
     )
-    print(f"已保存文章:{title}")
+    print(f"已保存文章及图片:{title}")
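
Note on the media_files=media_files argument above: this patch does not touch core/models.py, so it assumes the Article model already has a field that can store the list of MEDIA_ROOT-relative paths collected by the crawler. A minimal sketch of what such a field could look like is below; this is an assumption, not part of the patch, and every field other than media_files is illustrative (JSONField requires Django 3.1+).

    # core/models.py -- hypothetical sketch, not included in this diff
    from django.db import models

    class Website(models.Model):
        name = models.CharField(max_length=100, unique=True)

    class Article(models.Model):
        website = models.ForeignKey(Website, on_delete=models.CASCADE)
        title = models.CharField(max_length=255)
        url = models.URLField(unique=True)
        content = models.TextField()
        pub_date = models.DateTimeField()
        # MEDIA_ROOT-relative paths of downloaded images,
        # e.g. ["articles/<safe_title>/pic.jpg", ...]
        media_files = models.JSONField(default=list, blank=True)

Once such a field (and its migration) is in place, the crawler can be exercised with "python manage.py crawl_xinhua": it downloads each article's images into MEDIA_ROOT/articles/<safe_title>/, rewrites the img src attributes to MEDIA_URL paths, and saves the article with the list of downloaded files.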